Loading the Data

In [6]:
%load_ext rpy2.ipython
In [115]:
import pandas

cpu = pandas.read_csv('./sycl-bench-gold-computecpp-opencl.csv',comment='#')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-computecpp-cpu.csv',comment='#'),how='outer')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-dpc++-cpu.csv',comment='#'),how='outer')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-hipsycl-cpu.csv',comment='#'),how='outer')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-trisycl-cpu.csv',comment='#'),how='outer')

gpu = pandas.read_csv('./sycl-bench-p100-dpc++-cuda.csv',comment='#')
gpu = pandas.merge(gpu, pandas.read_csv('./sycl-bench-p100-hipsycl-cuda.csv',comment='#'),how='outer')
gpu = pandas.merge(gpu, pandas.read_csv('./sycl-bench-gfx906-hipsycl-rocm.csv',comment='#'),how='outer')

all_res = pandas.merge(gpu,cpu,how='outer')
print("using sample size of:",len(list(map(float,cpu['run-time-samples'][0].split()))),"elements per data-point")    
using sample size of: 50 elements per data-point

Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.

In [116]:
from tqdm import tqdm

import os.path
from os import path
if path.exists("./outdat.pkl"):
    # read previously created pickle file if it exists
    outdat = pandas.read_pickle("./outdat.pkl")
else:
    outdat = pandas.DataFrame()

    for index, row in tqdm(all_res.iterrows(),total=all_res.shape[0]):
        samples = row['run-time-samples']
        x = samples.split(' ')
        for y in x:
            tmprow = row
            tmprow['run-time-sample'] = float(y)
            outdat = outdat.append(tmprow)
    
    outdat = outdat.drop(columns=['run-time-samples'])
    outdat.to_pickle("./outdat.pkl")

all_res = outdat
100%|██████████| 1342/1342 [12:51<00:00,  1.74it/s]
In [117]:
all_res
Out[117]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric
0 MicroBench_HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223125 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0
0 MicroBench_HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223125 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0
0 MicroBench_HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223144 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0
0 MicroBench_HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223151 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0
0 MicroBench_HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223216 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1341 MicroBench_DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.034844 0.019687 2.034859 triSYCL 2.0
1341 MicroBench_DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.035934 0.019687 2.034859 triSYCL 2.0
1341 MicroBench_DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.038609 0.019687 2.034859 triSYCL 2.0
1341 MicroBench_DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.045763 0.019687 2.034859 triSYCL 2.0
1341 MicroBench_DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.085657 0.019687 2.034859 triSYCL 2.0

63688 rows × 19 columns

We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.

In [207]:
def clear_up_runtime (row):
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "hipSYCL ROCm - gfx906" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "hipSYCL CUDA - P100"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "hipSYCL OpenMP - Gold"    
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "DPC++ CUDA - P100"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "DPC++ pthreads - Gold"

    
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "DPC++ OpenCL - Gold"
    
    
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "ComputeCpp OpenCL - Gold"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "ComputeCpp pthreads - Gold"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "triSYCL OpenMP - Gold"
    
all_res['Runtime'] = all_res.apply (lambda row: clear_up_runtime(row), axis=1)

Convert these runtimes to factors.

In [208]:
%%R -i all_res -o all_res

all_res$Runtime <- as.factor(all_res$Runtime)
/usr/local/lib/python3.6/dist-packages/rpy2/robjects/pandas2ri.py:63: UserWarning: Error while trying to convert the column "Verification". Fall back to string conversion. The error is: Series can only be of one type, or None (and here we have <class 'str'> and <class 'rpy2.rinterface_lib.sexp.NACharacterType'>).
  % (name, str(e)))

Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.

In [209]:
%%R -i all_res -o colour_scale -o all_res

# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))

#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')

#assign an order to the way SYCL runtimes are presented
all_res$Runtime <- factor(all_res$Runtime, levels = c(
  "ComputeCpp pthreads - Gold",
  "DPC++ pthreads - Gold",
  "hipSYCL OpenMP - Gold",
  "triSYCL OpenMP - Gold",
  "ComputeCpp OpenCL - Gold",
  "DPC++ CUDA - P100",
  "hipSYCL CUDA - P100",
  "hipSYCL ROCm - gfx906"
 ))

#all_res$Runtime <- factor(all_res$Runtime, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))

library(scales)
colours <- hue_pal()(length(unique(all_res$Runtime)))

names(colours) <- levels(all_res$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)

Overview of Results

In [153]:
indat = all_res
In [210]:
%%R -i indat -i colour_scale -w 10 -h 10 --units in -r 200

library('ggplot2')
library('cowplot')
names(indat) <- make.names(names(indat), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(indat, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + geom_boxplot() + colour_scale

p1

This is obviously too crowded to make any sense of, so let's divide according to SYCL execution construct.

In [155]:
#list all available benchmark names
all_res['Benchmark name'].unique()

roi_wgp = all_res['Benchmark name'].isin(['LinearRegressionCoeff_fp32','LinearRegressionCoeff_fp64','MicroBench_LocalMem_int32_4096','MicroBench_LocalMem_fp32_4096','MicroBench_LocalMem_fp64_4096','NBody_NDRange_fp32','NBody_NDRange_fp64','Pattern_Reduction_NDRange_int32','Pattern_Reduction_NDRange_int64','Pattern_Reduction_NDRange_fp32','Pattern_Reduction_NDRange_fp64','Pattern_SegmentedReduction_NDRange_int16','Pattern_SegmentedReduction_NDRange_int32','Pattern_SegmentedReduction_NDRange_int64','Pattern_SegmentedReduction_NDRange_fp32','Pattern_SegmentedReduction_NDRange_fp64','ScalarProduct_NDRange_int32','ScalarProduct_NDRange_int64','ScalarProduct_NDRange_fp32','ScalarProduct_NDRange_fp64','Runtime_DAGTaskThroughput_NDRangeParallelFor','Runtime_IndependentDAGTaskThroughput_NDRangeParallelFor'])
wgp = all_res[roi_wgp]

roi_hdp = all_res['Benchmark name'].isin(['Runtime_IndependentDAGTaskThroughput_HierarchicalParallelFor','Runtime_DAGTaskThroughput_HierarchicalParallelFor','NBody_Hierarchical_fp32','NBody_Hierarchical_fp64','Pattern_Reduction_Hierarchical_int32','Pattern_Reduction_Hierarchical_int64','Pattern_Reduction_Hierarchical_fp32','Pattern_Reduction_Hierarchical_fp64','Pattern_SegmentedReduction_Hierarchical_int16','Pattern_SegmentedReduction_Hierarchical_int32','Pattern_SegmentedReduction_Hierarchical_int64','Pattern_SegmentedReduction_Hierarchical_fp32','Pattern_SegmentedReduction_Hierarchical_fp64','ScalarProduct_Hierarchical_int32','ScalarProduct_Hierarchical_int64','ScalarProduct_Hierarchical_fp32','ScalarProduct_Hierarchical_fp64'])
hdp = all_res[roi_hdp]

roi_task = all_res['Benchmark name'].isin(['Runtime_IndependentDAGTaskThroughput_SingleTask','Runtime_DAGTaskThroughput_SingleTask'])
task = all_res[roi_task]

roi_sync = all_res['Benchmark name'].isin(['LinearRegressionCoeff_fp32','LinearRegressionCoeff_fp64','MicroBench_LocalMem_int32_4096','MicroBench_LocalMem_fp32_4096','MicroBench_LocalMem_fp64_4096','NBody_NDRange_fp32','NBody_NDRange_fp64','Pattern_Reduction_NDRange_int32','Pattern_Reduction_NDRange_int64','Pattern_Reduction_NDRange_fp32','Pattern_Reduction_NDRange_fp64','Pattern_SegmentedReduction_NDRange_int16','Pattern_SegmentedReduction_NDRange_int32','Pattern_SegmentedReduction_NDRange_int64','Pattern_SegmentedReduction_NDRange_fp32','Pattern_SegmentedReduction_NDRange_fp64','ScalarProduct_NDRange_int32','ScalarProduct_NDRange_int64', 'ScalarProduct_NDRange_fp32','ScalarProduct_NDRange_fp64'])
sync = all_res[roi_sync]

roi_wgp =  [ not x for x in roi_wgp]
roi_hdp =  [ not x for x in roi_hdp]
roi_task = [ not x for x in roi_task]
roi_sync = [ not x for x in roi_sync]
roi_bkp =  [a and b and c and d for a, b, c, d in zip(roi_wgp, roi_hdp, roi_task, roi_sync)]
bkp = all_res[roi_bkp]

Now also chomp of the leading MicroBench_, Runtime_ and Pattern_ -- since the type of computation only lengthens the variable names in the plots

In [156]:
def chompLeadingSYCLBenchExperimentType(_dataframe):
    _dataframe['Benchmark name'] = _dataframe['Benchmark name'].str.replace("MicroBench_","")
    _dataframe['Benchmark name'] = _dataframe['Benchmark name'].str.replace("Runtime_","")
    _dataframe['Benchmark name'] = _dataframe['Benchmark name'].str.replace("Pattern_","")
    return(_dataframe)

bkp = chompLeadingSYCLBenchExperimentType(bkp)
bkp
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
Out[156]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric Runtime
1 HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223125 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0 DPC++ CUDA - P100
2 HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223125 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0 DPC++ CUDA - P100
3 HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223144 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0 DPC++ CUDA - P100
4 HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223151 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0 DPC++ CUDA - P100
5 HostDeviceBandwidth_1D_H2D_Contiguous PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 1024.0 0.230399 0.224118 0.223125 0.223216 0.018842 4.481796 LLVM CUDA (Codeplay) 1.0 DPC++ CUDA - P100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
63684 DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.034844 0.019687 2.034859 triSYCL 2.0 triSYCL OpenMP - Gold
63685 DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.035934 0.019687 2.034859 triSYCL 2.0 triSYCL OpenMP - Gold
63686 DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.038609 0.019687 2.034859 triSYCL 2.0 triSYCL OpenMP - Gold
63687 DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.045763 0.019687 2.034859 triSYCL 2.0 triSYCL OpenMP - Gold
63688 DRAM_fp32_1 PASS unknown NaN NaN NaN NaN NaN NaN 256.0 1024.0 1.005815 1.001539 0.982869 1.085657 0.019687 2.034859 triSYCL 2.0 triSYCL OpenMP - Gold

43170 rows × 20 columns

BKP -- Basic Data-Parallel Kernels

First we extract the kernels which utilize BKP. There are too many results to present in a single plot.

In [157]:
indat = bkp
In [158]:
%%R -i indat -i colour_scale -w 10 -h 10 --units in -r 200

library('ggplot2')
library('cowplot')
names(indat) <- make.names(names(indat), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(indat, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + geom_boxplot() + colour_scale

p1
/usr/local/lib/python3.6/dist-packages/rpy2/robjects/pandas2ri.py:63: UserWarning: Error while trying to convert the column "Verification". Fall back to string conversion. The error is: Series can only be of one type, or None (and here we have <class 'str'> and <class 'rpy2.rinterface_lib.sexp.NACharacterType'>).
  % (name, str(e)))

Still too busy... let's break it down to data-type each benchmark operates on.

In [159]:
#subset just kernels with a verified passing result.
bkp = bkp[bkp['Verification'] == "PASS"]

bkp['Benchmark name'].unique()
Out[159]:
array(['HostDeviceBandwidth_1D_H2D_Contiguous',
       'HostDeviceBandwidth_2D_H2D_Contiguous',
       'HostDeviceBandwidth_3D_H2D_Contiguous',
       'HostDeviceBandwidth_1D_D2H_Contiguous',
       'HostDeviceBandwidth_3D_D2H_Contiguous',
       'HostDeviceBandwidth_1D_H2D_Strided',
       'BlockedTransform_iter_64_blocksize_256',
       'BlockedTransform_iter_128_blocksize_256',
       'BlockedTransform_iter_256_blocksize_256',
       'BlockedTransform_iter_512_blocksize_256',
       'BlockedTransform_iter_64_blocksize_512',
       'BlockedTransform_iter_128_blocksize_512',
       'BlockedTransform_iter_256_blocksize_512',
       'BlockedTransform_iter_512_blocksize_512',
       'BlockedTransform_iter_64_blocksize_1024',
       'BlockedTransform_iter_128_blocksize_1024',
       'BlockedTransform_iter_256_blocksize_1024',
       'BlockedTransform_iter_512_blocksize_1024',
       'BlockedTransform_iter_64_blocksize_2048',
       'BlockedTransform_iter_128_blocksize_2048',
       'BlockedTransform_iter_256_blocksize_2048',
       'BlockedTransform_iter_512_blocksize_2048',
       'BlockedTransform_iter_64_blocksize_4096',
       'BlockedTransform_iter_128_blocksize_4096',
       'BlockedTransform_iter_256_blocksize_4096',
       'BlockedTransform_iter_512_blocksize_4096',
       'BlockedTransform_iter_64_blocksize_8192',
       'BlockedTransform_iter_128_blocksize_8192',
       'BlockedTransform_iter_256_blocksize_8192',
       'BlockedTransform_iter_512_blocksize_8192',
       'BlockedTransform_iter_64_blocksize_16384',
       'BlockedTransform_iter_128_blocksize_16384',
       'BlockedTransform_iter_256_blocksize_16384',
       'BlockedTransform_iter_512_blocksize_16384',
       'BlockedTransform_iter_64_blocksize_32768',
       'BlockedTransform_iter_128_blocksize_32768',
       'BlockedTransform_iter_256_blocksize_32768',
       'BlockedTransform_iter_512_blocksize_32768',
       'BlockedTransform_iter_64_blocksize_65536',
       'BlockedTransform_iter_128_blocksize_65536',
       'BlockedTransform_iter_256_blocksize_65536',
       'BlockedTransform_iter_512_blocksize_65536',
       'BlockedTransform_iter_64_blocksize_131072',
       'BlockedTransform_iter_128_blocksize_131072',
       'BlockedTransform_iter_256_blocksize_131072',
       'BlockedTransform_iter_512_blocksize_131072',
       'BlockedTransform_iter_64_blocksize_262144',
       'BlockedTransform_iter_128_blocksize_262144',
       'BlockedTransform_iter_256_blocksize_262144',
       'BlockedTransform_iter_512_blocksize_262144',
       'BlockedTransform_iter_64_blocksize_524288',
       'BlockedTransform_iter_128_blocksize_524288',
       'BlockedTransform_iter_256_blocksize_524288',
       'BlockedTransform_iter_512_blocksize_524288', 'Polybench_Mvt',
       'LinearRegression_fp32', 'LinearRegression_fp64',
       'Polybench_3DConvolution', 'Polybench_Gramschmidt',
       'Polybench_Atax', 'Polybench_2mm', 'Polybench_Gesummv',
       'MatmulChain', 'VectorAddition_int32', 'VectorAddition_int64',
       'VectorAddition_fp32', 'VectorAddition_fp64', 'Polybench_Bicg',
       'IndependentDAGTaskThroughput_BasicParallelFor', 'Polybench_Gemm',
       'MolecularDynamics', 'Kmeans_fp32', 'Kmeans_fp64', 'Polybench_3mm',
       'Polybench_Syrk', 'Polybench_2DConvolution',
       'Polybench_Covariance', 'Polybench_Fdtd2d', 'Polybench_Syr2k',
       'DAGTaskThroughput_BasicParallelFor', 'DRAM_fp32_1', 'DRAM_fp32_3',
       'DRAM_fp64_1', 'DRAM_fp64_3',
       'HostDeviceBandwidth_2D_D2H_Contiguous',
       'HostDeviceBandwidth_2D_H2D_Strided',
       'HostDeviceBandwidth_3D_H2D_Strided',
       'HostDeviceBandwidth_1D_D2H_Strided',
       'HostDeviceBandwidth_2D_D2H_Strided',
       'HostDeviceBandwidth_3D_D2H_Strided', 'sf_fp32_16', 'sf_fp64_16',
       'Polybench_Correlation', 'Arith_int32_512', 'Arith_fp32_512',
       'Arith_fp64_512', 'Sobel3', 'Sobel7', 'DRAM_fp32_2', 'DRAM_fp64_2',
       'Sobel5', 'MedianFilter'], dtype=object)

BKP Divided by Data-Type

Float

In [211]:
#subset just kernels with a verified passing result.
bkp = bkp[bkp['Verification'] == "PASS"]

#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
bkp_fp32 = bkp[bkp['Benchmark name'].str.contains("fp32")]
bkp_fp64 = bkp[bkp['Benchmark name'].str.contains("fp64")]
bkp_int32 = bkp[bkp['Benchmark name'].str.contains("int32")]
bkp_int64 = bkp[bkp['Benchmark name'].str.contains("int64")]

indat = bkp_fp32
In [212]:
%%R -i bkp_fp32 -i bkp_fp64 -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_fp32) <- make.names(names(bkp_fp32), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(bkp_fp64) <- make.names(names(bkp_fp64), unique = FALSE, allow_ = TRUE)

p3 <- ggplot(bkp_fp64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p4 <- ggplot(bkp_fp64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                p3 + theme(legend.position="none"),
                p4 + theme(legend.position="none"),
                labels = c('Float32', '','Float64',''), label_size = 10, align = 'vh', hjust = -2, nrow = 2)

#side legend
#legend <- get_legend(p1 + theme(legend.box.margin = margin(0, 0, 0, 0)))
#plot_grid(pg, legend, rel_widths = c(3, .85))

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [215]:
%%R -i bkp_fp32 -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_fp32) <- make.names(names(bkp_fp32), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('bkp-float.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

Int

In [178]:
%%R -i bkp_int32 -i bkp_int64 -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.50

library('ggplot2')
library('latex2exp')

names(bkp_int32) <- make.names(names(bkp_int32), unique = FALSE, allow_ = TRUE)

p5 <- ggplot(bkp_int32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p6 <- ggplot(bkp_int32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(bkp_int64) <- make.names(names(bkp_int64), unique = FALSE, allow_ = TRUE)

p7 <- ggplot(bkp_int64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size)  + colour_scale
p8 <- ggplot(bkp_int64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale



library('cowplot')
pg <- plot_grid(p5 + theme(legend.position="none"),
                p6 + theme(legend.position="none"),
                p7 + theme(legend.position="none"),
                p8 + theme(legend.position="none"),
                labels = c('Int32','','Int64',''), label_size = 10, align = 'vh', hjust = -2, nrow = 2)

#side legend
#legend <- get_legend(p1 + theme(legend.box.margin = margin(0, 0, 0, 0)))
#plot_grid(pg, legend, rel_widths = c(3, .85))

#bottom legend
legend <- get_legend(p5 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

Other BKP Kernels without data-types

In [187]:
#subset just kernels with a verified passing result.
bkp = bkp[bkp['Verification'] == "PASS"]

bkp_block = bkp[bkp['Benchmark name'].str.contains("BlockedTransform_iter_")]
bkp_bandw = bkp[bkp['Benchmark name'].str.contains("HostDeviceBandwidth_")]

#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
bkp_other = bkp[~(bkp['Benchmark name'].str.contains("fp32") | bkp['Benchmark name'].str.contains("fp64") | bkp['Benchmark name'].str.contains("int32") | bkp['Benchmark name'].str.contains("int64") | bkp['Benchmark name'].str.contains("BlockedTransform_iter_") | bkp['Benchmark name'].str.contains("HostDeviceBandwidth_"))]
In [180]:
#discard all but the largest problem size for DagTaskThroughput kernels
bkp_other = bkp_other[ (~ bkp_other['Benchmark name'].str.contains('DAGTaskThroughput')) | (bkp_other['Benchmark name'].str.contains('DAGTaskThroughput') & (bkp_other['problem-size'] == 65536))]
In [181]:
%%R -i bkp_other -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_other) <- make.names(names(bkp_other), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(bkp_other, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size)  + colour_scale
p2 <- ggplot(bkp_other, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size)  + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

This is still too noisy to readily compare, so let's separate the results by each device thereby we can directly compare the SYCL runtimes (implementation paired with backend) on the same hardware.

In [184]:
bkp_other_gpu = bkp_other[bkp_other['Runtime'].str.contains("P100") | bkp_other['Runtime'].str.contains("gfx906")]
bkp_other_cpu = bkp_other[bkp_other['Runtime'].str.contains("Gold")]
In [185]:
%%R -i bkp_other_gpu -i colour_scale -i bkp_other_cpu -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_other_cpu) <- make.names(names(bkp_other_cpu), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(bkp_other_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_other_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(bkp_other_gpu) <- make.names(names(bkp_other_gpu), unique = FALSE, allow_ = TRUE)

p3 <- ggplot(bkp_other_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p4 <- ggplot(bkp_other_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                get_legend(p1 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
                p3 + theme(legend.position="none"),
                p4 + theme(legend.position="none"),
                get_legend(p3 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
                labels = c("Other BKP on CPUs","","","Other BKPs on GPUs","",""),
                align = 'hv', hjust = -1.85, vjust = -.8, nrow = 2,
                rel_widths = c(1,1,.33))

pg

BKP -- Blocked Transform

From the source-code (runtime/blocked_transform.cpp:39-49): (The blocked transform) performs a blocked transform operation using the mandelbrot sequence as kernels. The number of iterations of the sequence -- and hence the runtime of the kernel can be adjusted using Num_iterations. This benchmark processes the data in chunks that are assigned to independent kernels, therefore this benchmark tests

  • Overlapping of compute and data transfers
  • concurrent kernel execution
  • if the implementation of ranged accessors creates independent accesses if accessed ranges are non-overlapping.

In order for the benchmark to stress these aspects, Num_iterations should be tuned such that the kernel runtime is similar to the data transfer time of one block.

In [188]:
%%R -i bkp_block -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_block) <- make.names(names(bkp_block), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(bkp_block, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_block, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

Sorted by iteration and block-size -- and shorten the name

In [189]:
bkp_block['Benchmark name']

bkp_block['iter'] = bkp_block['Benchmark name'].str.extract(r'iter_(\d+)_')
bkp_block['blocksize'] = bkp_block['Benchmark name'].str.extract(r'blocksize_(\d+)')

bkp_block
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
Out[189]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size ... run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric Runtime iter blocksize
304 BlockedTransform_iter_64_blocksize_256 PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 ... 0.061605 0.058476 0.058476 0.012554 NaN LLVM CUDA (Codeplay) NaN DPC++ CUDA - P100 64 256
305 BlockedTransform_iter_64_blocksize_256 PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 ... 0.061605 0.058476 0.058760 0.012554 NaN LLVM CUDA (Codeplay) NaN DPC++ CUDA - P100 64 256
306 BlockedTransform_iter_64_blocksize_256 PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 ... 0.061605 0.058476 0.058920 0.012554 NaN LLVM CUDA (Codeplay) NaN DPC++ CUDA - P100 64 256
307 BlockedTransform_iter_64_blocksize_256 PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 ... 0.061605 0.058476 0.059056 0.012554 NaN LLVM CUDA (Codeplay) NaN DPC++ CUDA - P100 64 256
308 BlockedTransform_iter_64_blocksize_256 PASS Tesla P100-PCIE-12GB NaN NaN NaN NaN NaN NaN 256.0 ... 0.061605 0.058476 0.059061 0.012554 NaN LLVM CUDA (Codeplay) NaN DPC++ CUDA - P100 64 256
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
53955 BlockedTransform_iter_512_blocksize_524288 PASS hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 256.0 ... 0.205413 0.146830 0.260136 0.035568 NaN hipSYCL NaN hipSYCL OpenMP - Gold 512 524288
53956 BlockedTransform_iter_512_blocksize_524288 PASS hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 256.0 ... 0.205413 0.146830 0.263489 0.035568 NaN hipSYCL NaN hipSYCL OpenMP - Gold 512 524288
53957 BlockedTransform_iter_512_blocksize_524288 PASS hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 256.0 ... 0.205413 0.146830 0.269154 0.035568 NaN hipSYCL NaN hipSYCL OpenMP - Gold 512 524288
53958 BlockedTransform_iter_512_blocksize_524288 PASS hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 256.0 ... 0.205413 0.146830 0.270207 0.035568 NaN hipSYCL NaN hipSYCL OpenMP - Gold 512 524288
53959 BlockedTransform_iter_512_blocksize_524288 PASS hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 256.0 ... 0.205413 0.146830 0.275260 0.035568 NaN hipSYCL NaN hipSYCL OpenMP - Gold 512 524288

16800 rows × 22 columns

In [190]:
%%R -i bkp_block -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_block) <- make.names(names(bkp_block), unique = FALSE, allow_ = TRUE)
bkp_block$blocksize <- reorder(bkp_block$blocksize, as.numeric(bkp_block$blocksize))


p1 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

It appears GPU devices perform better over larger blocksizes whereas CPU devices are more affected by increasing the blocksize.

The use of the common CPU backend limits the performance on the Xeon Gold --shown in both the ComputeCpp/CPU and DPC++/CPU runtimes-- which were equally ~an order of magnitude worse than the OpenMP and OpenCL backends. OpenCL had the best performance of any of the backends on the Xeon Gold device -- shown in the ComputeCpp/OpenCL runtime. The hipSYCL/OpenMP has the most variance was the 2nd best performer over all blocksizes.

On the Tesla P100 the CUDA backend performs well in general, as the blocksize increases the hipSYCL implementation slightly wins out over DPC++. The Vega performs worse out the GPUs on this benchmark, however, the performance gap between the common hipSYCL implementation stays the same as the block-sizes increase -- so we can credit this to comparing unequal hardware (with different specifications).

We now also perform a breakdown to examine whether the other variable iter affects the difference in performance on blocksize.

In [191]:
%%R -i bkp_block -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_block) <- make.names(names(bkp_block), unique = FALSE, allow_ = TRUE)
bkp_block$blocksize <- reorder(bkp_block$blocksize, as.numeric(bkp_block$blocksize))
bkp_block$iter <- reorder(bkp_block$iter, as.numeric(bkp_block$iter))
#rename for plotting
levels(bkp_block$iter) <- paste(levels(bkp_block$iter),"iterations")

p1 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ iter, strip.position = "top", scales = "free_x") + colour_scale
p2 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ iter, strip.position = "top", scales = "free_x") + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

As expected by increasing the number of iterations of each test that is run, the execution time also increases. This has the largest impact on the CPU backends because these were the worst performing devices -- thus the penalty they incur is highlighted by increasing the amount of work required in each test.

By breaking this down into their respective iterations we can remove a large amount of variance from the previous plot -- this is shown by the much smaller range of the upper and lower quartiles in the box-plots. We also see less variance with the increased sample size.

As such we select the maximum number of iterations to present in our final results.

In [192]:
bkp_block512 = bkp_block[bkp_block['Benchmark name'].str.contains("iter_512")]
In [193]:
%%R -i bkp_block512 -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_block512) <- make.names(names(bkp_block512), unique = FALSE, allow_ = TRUE)
bkp_block512$blocksize <- reorder(bkp_block512$blocksize, as.numeric(bkp_block512$blocksize))


p1 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
pgw <- plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
print(pgw)

ggsave('figs/bkp_block512.pdf',pgw, width = 8.3, height = 11.7, dpi = 300, units = "in")
In [217]:
%%R -i bkp_block512 -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_block512) <- make.names(names(bkp_block512), unique = FALSE, allow_ = TRUE)
bkp_block512$blocksize <- reorder(bkp_block512$blocksize, as.numeric(bkp_block512$blocksize))

p1 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('blocked-transform.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

This figure presents the performance of the Blocksize test on two GPU devices --the Nvidia Tesla P100 and AMD Vega 20-- and the Xeon Gold CPU. The test measures concurrent kernel execution and overlapping of compute and data transfers. The number of iterations was selected to be 512. The blocksize increases over the x-axis. It appears GPU devices perform better over larger blocksizes whereas CPU devices are more affected by increasing the blocksize.

The use of the common CPU backend limits the performance on the Xeon Gold --shown in both the ComputeCpp/CPU and DPC++/CPU runtimes-- which were equally ~an order of magnitude worse than the OpenMP and OpenCL backends. OpenCL had the best performance of any of the backends on the Xeon Gold device -- shown in the ComputeCpp/OpenCL runtime. The hipSYCL/OpenMP has the most variance was the 2nd best performer over all blocksizes.

On the Tesla P100 the CUDA backend performs well in general, as the blocksize increases the hipSYCL implementation slightly wins out over DPC++. The Vega performs worse out the GPUs on this benchmark, however, the performance gap between the common hipSYCL implementation stays the same as the block-sizes increase, and stays in the same order of magnitude -- so we can credit this to comparing unequal hardware (with different specifications of the P100 and the Vega 20).

TODO: investigate block performance difference between CPU (pthreads?) vs Other backends Too heavyweight to be suited to this SYCL test?.

BKP -- Bandwidth

From the source-code (micro/host_device_bandwidth.cpp:53-64):

Microbenchmark measuring host<->device bandwidth for contiguous and strided copies. For non-strided copies we use a dummy kernel, as explicit copy operations are not fully supported by some SYCL implementations. Strided copies use a larger SYCL buffer and copy a portion out of the middle. For example, a (512, 512) element 2D-copy at offset (1, 1) out of a (514, 514) element SYCL buffer. The host buffer is never strided (as this is not supported by SYCL 1.2.1).

To avoid SYCL implementations to just copy the entire buffer when using a strided accessor we use explicit copy operations for strided copies.

In [194]:
#shorten kernel names
bkp_bandw = bkp[bkp['Benchmark name'].str.contains("HostDeviceBandwidth_")]

bkp_bandw['Benchmark name'] = bkp_bandw['Benchmark name'].str.replace('HostDeviceBandwidth_','')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
In [206]:
%%R -i bkp_bandw -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_bandw) <- make.names(names(bkp_bandw), unique = FALSE, allow_ = TRUE)
#bkp_bandw$Benchmark.name <- reorder(bkp_bandw$Benchmark.name, as.numeric(bkp_bandw$blocksize))


p1 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [219]:
%%R -i bkp_bandw -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_bandw) <- make.names(names(bkp_bandw), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('bandwidth.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

It appears that communication takes 4 orders of magnitude longer on the GPU than the CPU -- this is unsurprising since it should only be a SYCL API call on the CPU host to transfer to the CPU device whereas the GPU has to go over PCI-E. Let's split the data by device.

In [196]:
bkp_bandw_gpu = bkp_bandw[bkp_bandw['Runtime'].str.contains("P100") | bkp_bandw['Runtime'].str.contains("gfx906")]
bkp_bandw_cpu = bkp_bandw[bkp_bandw['Runtime'].str.contains("Gold")]
In [197]:
%%R -i bkp_bandw_gpu -i bkp_bandw_cpu -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_bandw_cpu) <- make.names(names(bkp_bandw_cpu), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(bkp_bandw_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_bandw_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(bkp_bandw_gpu) <- make.names(names(bkp_bandw_gpu), unique = FALSE, allow_ = TRUE)

p3 <- ggplot(bkp_bandw_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p4 <- ggplot(bkp_bandw_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                get_legend(p1 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
                p3 + theme(legend.position="none"),
                p4 + theme(legend.position="none"),
                get_legend(p3 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
                labels = c("BKP bandwidth test on CPUs","","","BKP bandwidth test on GPUs","",""),
                align = 'hv', hjust = -1.85, vjust = -.8, nrow = 2,
                rel_widths = c(1,1,.33))

pg

A direct comparison on the GPU architectures can be made based on suitability of SYCL implementations. For instance,

When we consider

TODO: Summary.

WGP -- Work-Group Data-Parallel Kernels

In [242]:
wgp = chompLeadingSYCLBenchExperimentType(wgp)

#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
#discard all but the largest problem size for DagTaskThroughput kernels
wgp_sub = wgp[ (~ wgp['Benchmark name'].str.contains('DAGTaskThroughput')) | (wgp['Benchmark name'].str.contains('DAGTaskThroughput') & (wgp['problem-size'] == 65536))]
In [243]:
%%R -i wgp -i wgp_sub -i colour_scale -w 10 -h 10 --units in -r 200

library('ggplot2')
library('cowplot')
names(wgp) <- make.names(names(wgp), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(wgp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot() + colour_scale

p1
In [244]:
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]

wgp['Benchmark name'].unique()
Out[244]:
array(['IndependentDAGTaskThroughput_NDRangeParallelFor',
       'LocalMem_int32_4096', 'LocalMem_fp32_4096', 'LocalMem_fp64_4096',
       'SegmentedReduction_NDRange_int16',
       'SegmentedReduction_NDRange_int32',
       'SegmentedReduction_NDRange_int64',
       'SegmentedReduction_NDRange_fp32',
       'SegmentedReduction_NDRange_fp64', 'LinearRegressionCoeff_fp32',
       'LinearRegressionCoeff_fp64', 'ScalarProduct_NDRange_int32',
       'ScalarProduct_NDRange_int64', 'ScalarProduct_NDRange_fp32',
       'ScalarProduct_NDRange_fp64',
       'DAGTaskThroughput_NDRangeParallelFor', 'NBody_NDRange_fp64',
       'Reduction_NDRange_int32', 'Reduction_NDRange_int64',
       'Reduction_NDRange_fp64', 'NBody_NDRange_fp32'], dtype=object)

All datum, except for DAGTask applications, can be presented according to their data-type so this seems like a logical way to break-down and interpret the data.

WGP Divided by Data-Type

Float

In [245]:
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]

#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
wgp_fp = wgp[wgp['Benchmark name'].str.contains("_fp")]
wgp_fp['data.type.width'] = wgp_fp['Benchmark name'].str.extract(r'_fp(\d+)')
wgp_fp['Benchmark name'] = wgp_fp['Benchmark name'].str.replace(r'_fp(\d+)','')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
In [246]:
%%R -i wgp_fp -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_fp) <- make.names(names(wgp_fp), unique = FALSE, allow_ = TRUE)
wgp_fp$data.type.width <- reorder(wgp_fp$data.type.width, as.numeric(wgp_fp$data.type.width))
#rename for plotting
levels(wgp_fp$data.type.width) <- paste("float",levels(wgp_fp$data.type.width),sep='')

p1 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale
p2 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

Int

In [247]:
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]

#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
wgp_int = wgp[wgp['Benchmark name'].str.contains("_int")]
wgp_int['data.type.width'] = wgp_int['Benchmark name'].str.extract(r'_int(\d+)')
wgp_int['Benchmark name'] = wgp_int['Benchmark name'].str.replace(r'_int(\d+)','')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
In [248]:
%%R -i wgp_int -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_int) <- make.names(names(wgp_int), unique = FALSE, allow_ = TRUE)
wgp_int$data.type.width <- reorder(wgp_int$data.type.width, as.numeric(wgp_int$data.type.width))
#rename for plotting
levels(wgp_int$data.type.width) <- paste("int",levels(wgp_int$data.type.width),sep='')

p1 <- ggplot(wgp_int, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale
p2 <- ggplot(wgp_int, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [249]:
%%R -i wgp_int -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_int) <- make.names(names(wgp_int), unique = FALSE, allow_ = TRUE)
wgp_int$data.type.width <- reorder(wgp_int$data.type.width, as.numeric(wgp_int$data.type.width))
wgp_int <- subset(wgp_int, data.type.width=="32")
#rename for plotting
levels(wgp_int$data.type.width) <- paste("int",levels(wgp_int$data.type.width),sep='')

p1 <- ggplot(wgp_int, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('wgp-int.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
In [245]:
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]

#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
wgp_fp = wgp[wgp['Benchmark name'].str.contains("_fp")]
wgp_fp['data.type.width'] = wgp_fp['Benchmark name'].str.extract(r'_fp(\d+)')
wgp_fp['Benchmark name'] = wgp_fp['Benchmark name'].str.replace(r'_fp(\d+)','')
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
In [271]:
%%R -i wgp_fp -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_fp) <- make.names(names(wgp_fp), unique = FALSE, allow_ = TRUE)
wgp_fp$data.type.width <- reorder(wgp_fp$data.type.width, as.numeric(wgp_fp$data.type.width))
wgp_fp <- subset(wgp_fp, data.type.width=="32")
#rename for plotting
levels(wgp_fp$data.type.width) <- paste("fp",levels(wgp_fp$data.type.width),sep='')

p1 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('wgp-float.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

DAGTask

Let's plot the last couple of kernels without explicit data-types:

Runtime_IndependentDAGTaskThroughput_NDRangeParallelFor and Runtime_DAGTaskThroughput_NDRangeParallelFor.

In [250]:
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]

#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
wgp_dag = wgp[wgp['Benchmark name'].str.contains("DAGTaskThroughput")]
In [251]:
%%R -i wgp_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_dag) <- make.names(names(wgp_dag), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

From the box-and-whisker plots, we notice there is a large variation in performance on most SYCL Runtimes --with exceptions on the Xeon Gold ComputeCpp/CPU and Vega 20 hipSYCL/ROCm-- and merits additional investigation.

In [252]:
wgp_dag['problem-size'].unique()
Out[252]:
array([ 1024.,  2048.,  4096.,  8192., 16384., 32768., 65536.])

Eureka! The data is amalgamated over several different problem sizes. Let's present these results separately.

In [253]:
%%R -i wgp_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_dag) <- make.names(names(wgp_dag), unique = FALSE, allow_ = TRUE)

wgp_dag$problem.size <- reorder(wgp_dag$problem.size, as.numeric(wgp_dag$problem.size))

#rename for plotting
levels(wgp_dag$problem.size) <- paste("Size: ",levels(wgp_dag$problem.size))

p1 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ problem.size, strip.position = "top", scales = "free_x") + colour_scale
p1
#p2 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ problem.size, strip.position = "top", scales = "free_x") + colour_scale

#library('cowplot')
#pg <- plot_grid(p1 + theme(legend.position="none"),
#                p2 + theme(legend.position="none"),
#                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
#legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
#plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [254]:
%%R -i wgp_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_dag) <- make.names(names(wgp_dag), unique = FALSE, allow_ = TRUE)

wgp_dag$problem.size <- reorder(wgp_dag$problem.size, as.numeric(wgp_dag$problem.size))

#rename for plotting
levels(wgp_dag$problem.size) <- paste("Size: ",levels(wgp_dag$problem.size))

p2 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ problem.size, strip.position = "top", scales = "free_x") + colour_scale
p2
#library('cowplot')
#pg <- plot_grid(p1 + theme(legend.position="none"),
#                p2 + theme(legend.position="none"),
#                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
#legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
#plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

As the problem size increases, we see the overall magnitudes of execution time also increase. This is most apparent when examining the hipSYCL/OpenMP runtime on the Xeon Gold -- especially on the TaskThroughput kernel. However the relative ordering remains the same. With the DPC++/CPU runtime performing the best on all problem sizes, followed by the ComputeCpp/OpenCL runtime, then the CUDA backends performing equally regardless of wether hipSYCL or DPC++ is used as the implementation. Since we have the data, we subset the data by solely selecting the largest problem size (65536 or 2^16) whenever these DAG kernels are presented.

In [257]:
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]

wgp_sub_dag = wgp[wgp['Benchmark name'].str.contains("DAGTaskThroughput")]
#select just the largest problem size
wgp_sub_dag = wgp_sub_dag[wgp_sub_dag['problem-size'] == 65536]
#rename/shorten names
wgp_sub_dag['Benchmark name'] = wgp_sub_dag['Benchmark name'].str.extract(r'(\w+)_')
In [258]:
%%R -i wgp_sub_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(wgp_sub_dag) <- make.names(names(wgp_sub_dag), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(wgp_sub_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(wgp_sub_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

HDP – Hierarchical data-parallel kernels

SYCL offers compiler level support –doesn’t change the underlying execution model of the kernel– for expressing the hierarchical nature of data-parallelism –such as work-groups– and can be used for performance tuning. A range is provided into the enqueuing functions to specify the number of work-groups to launch and an optional size of each work-group.

parallel_for_work_item: Use of this function in the suite indicates there has been an attempt made to optimize the application to use private memory. This corresponds to the lowest level cache / smallest-faster memory on the accelerator. 6 of the 37 applications examined make use of enqueuing via parallel_for_work_item, namely,dag_task_throughput_independent (1), dag_task_throughput_sequential (1), nbody (4), scalar_prod (4), segmentedreduction (3) and reduction (3).

parallel_for_work_group: Presents a degree of optimization around the use of local memory, because all variables declared in this scope are allocated in workgroup local memory. The same applications that use parallel_for_work_item also use parallel_for_work_group. The number of times they are used differ; dag_task_throughput_independent (1), dag_task_throughput_sequential (1), n-body (1), scalar_prod (2), segmentedreduction (1) and reduction (1).

In [259]:
hdp = chompLeadingSYCLBenchExperimentType(hdp)
#subset just kernels with a verified passing result.
hdp = hdp[hdp['Verification'] == "PASS"]
#discard all but the largest problem size for DagTaskThroughput kernels
hdp = hdp[ (~ hdp['Benchmark name'].str.contains('DAGTaskThroughput')) | (hdp['Benchmark name'].str.contains('DAGTaskThroughput') & (hdp['problem-size'] == 65536))]
In [260]:
%%R -i hdp -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(hdp) <- make.names(names(hdp), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [263]:
%%R -i hdp -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(hdp) <- make.names(names(hdp), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 50, hjust = 1, size = 5)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('hdp-dt.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

Task – Single-Task Kernels

A kernel is executed once, conceptually, on a single compute-unit, in one work-group, as one work-item; these kernels can be executed on multiple devices and queues and encompass task-based parallelism. It is used with the single_task function. In the suite, 3 applications use this construct; dag_task_throughput_sequential (1), dag_task_throughput_independent (1) and host_device_bandwidth (1). However, host_device_bandwidth submits a non-operation single_task to force a read-only buffer to be copied in the micro-benchmark, since this kernel does nowork it is omitted from the evaluation.

In [264]:
task = chompLeadingSYCLBenchExperimentType(task)

#subset just kernels with a verified passing result.
task = task[task['Verification'] == "PASS"]
In [265]:
%%R -i task -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(task) <- make.names(names(task), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [266]:
%%R -i task -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(task) <- make.names(names(task), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('task.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

Sync – Synchronization

In general, operations between the host and the device(s) will require synchronization; such as buffer destruction, host accessors, command group enqueue, queue operations etc. Instead, we focus on user-controllable synchronization events: those that occur within a kernels execution, either globally or locally – within a work-group. The barrier function is used inside kernels to synchronize between work-items in a work-group. It is used in 7 of the 37 kernels, namely, reduction (1), segmentedreduction (1), lin_reg_coeff (2), scalar_prod (2), nbody (2), local_mem (2). lin_reg_coeff, scalar_prod and local_mem request a local_space fence synchronization within a work-group whereas reduction, segmented_reduction and nbody use the default global barrier. NDRange versions of these kernels are the ones which contain barriers – the hierarchical variations do not. The nbody kernel contains two barriers in the same invocation, as does the local_mem benchmark. Reduction contains one barrier in the inner-most loop of the NDRange implementation.

In [267]:
sync = chompLeadingSYCLBenchExperimentType(sync)

#subset just kernels with a verified passing result.
sync = sync[sync['Verification'] == "PASS"]
In [268]:
%%R -i sync -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(sync) <- make.names(names(sync), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
                p2 + theme(legend.position="none"),
                align = 'vh', hjust = -2, nrow = 2)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
In [269]:
%%R -i sync -i colour_scale -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(sync) <- make.names(names(sync), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

ggsave('sync.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)

Generating Results

This is a good opportunity to record how the results were compiled and collected.

ComputeCPP

Compile with ComputeCPP and remove non-applications from the final build.

In [33]:
! rm -r ./computecpp-benchmarks
! mkdir ./computecpp-benchmarks && cd ./computecpp-benchmarks && cmake ../.. -DSYCL_IMPL=ComputeCpp -DCMAKE_PREFIX_PATH=/tmp/ComputeCpp-latest && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Looking for CL_VERSION_2_2
-- Looking for CL_VERSION_2_2 - found
-- Found OpenCL: /usr/lib/x86_64-linux-gnu/libOpenCL.so (found version "2.2") 
-- platform - your system can support ComputeCpp
-- Found ComputeCpp: /tmp/ComputeCpp-latest (found version "CE 1.3.0") 
-- compute++ flags - -O2;-mllvm;-inline-threshold=1000;-intelspirmetadata;-sycl-target;spir64
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks
Scanning dependencies of target mvt_mvt.cpp_0_ih
Scanning dependencies of target gesummv_gesummv.cpp_0_ih
Scanning dependencies of target 3mm_3mm.cpp_0_ih
Scanning dependencies of target gemm_gemm.cpp_0_ih
Scanning dependencies of target gramschmidt_gramschmidt.cpp_0_ih
Scanning dependencies of target fdtd2d_fdtd2d.cpp_0_ih
Scanning dependencies of target 3DConvolution_3DConvolution.cpp_0_ih
Scanning dependencies of target correlation_correlation.cpp_0_ih
Scanning dependencies of target 2DConvolution_2DConvolution.cpp_0_ih
Scanning dependencies of target sobel_sobel.cpp_0_ih
Scanning dependencies of target 2mm_2mm.cpp_0_ih
Scanning dependencies of target sobel7_sobel7.cpp_0_ih
Scanning dependencies of target pattern_L2_pattern_L2.cpp_0_ih
Scanning dependencies of target sf_sf.cpp_0_ih
Scanning dependencies of target scalar_prod_scalar_prod.cpp_0_ih
Scanning dependencies of target sobel5_sobel5.cpp_0_ih
[  2%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/gesummv_gesummv.cpp.sycl
[  2%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/mvt_mvt.cpp.sycl
[  3%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/gramschmidt_gramschmidt.cpp.sycl
[  4%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/3mm_3mm.cpp.sycl
[  5%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/gemm_gemm.cpp.sycl
[  5%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/3DConvolution_3DConvolution.cpp.sycl
[  6%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/sobel_sobel.cpp.sycl
[  7%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/correlation_correlation.cpp.sycl
[  8%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/2mm_2mm.cpp.sycl
[  8%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/scalar_prod_scalar_prod.cpp.sycl
[ 10%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/2DConvolution_2DConvolution.cpp.sycl
[ 10%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/sobel7_sobel7.cpp.sycl
[ 12%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/sobel5_sobel5.cpp.sycl
[ 12%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/fdtd2d_fdtd2d.cpp.sycl
[ 13%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/pattern_L2_pattern_L2.cpp.sycl
[ 14%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/sf_sf.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 14%] Built target gemm_gemm.cpp_0_ih
Scanning dependencies of target nbody_nbody.cpp_0_ih
[ 15%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/nbody_nbody.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 15%] Built target 3DConvolution_3DConvolution.cpp_0_ih
[ 15%] Built target 2mm_2mm.cpp_0_ih
Scanning dependencies of target median_median.cpp_0_ih
Scanning dependencies of target arith_arith.cpp_0_ih
[ 16%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/median_median.cpp.sycl
[ 17%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/arith_arith.cpp.sycl
[ 17%] Built target 2DConvolution_2DConvolution.cpp_0_ih
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
Scanning dependencies of target local_mem_local_mem.cpp_0_ih
[ 17%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/local_mem_local_mem.cpp.sycl
[ 17%] Built target gesummv_gesummv.cpp_0_ih
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
Scanning dependencies of target DRAM_DRAM.cpp_0_ih
[ 18%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/DRAM_DRAM.cpp.sycl
[ 18%] Built target sobel5_sobel5.cpp_0_ih
Scanning dependencies of target covariance_covariance.cpp_0_ih
[ 18%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/covariance_covariance.cpp.sycl
[ 18%] Built target fdtd2d_fdtd2d.cpp_0_ih
[ 18%] Built target 3mm_3mm.cpp_0_ih
Scanning dependencies of target reduction_reduction.cpp_0_ih
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
Scanning dependencies of target host_device_bandwidth_host_device_bandwidth.cpp_0_ih
[ 19%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/reduction_reduction.cpp.sycl
[ 20%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/host_device_bandwidth_host_device_bandwidth.cpp.sycl
[ 20%] Built target sf_sf.cpp_0_ih
Scanning dependencies of target lin_reg_error_lin_reg_error.cpp_0_ih
[ 21%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/lin_reg_error_lin_reg_error.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 21%] Built target sobel7_sobel7.cpp_0_ih
[ 21%] Built target correlation_correlation.cpp_0_ih
Scanning dependencies of target kmeans_kmeans.cpp_0_ih
Scanning dependencies of target atax_atax.cpp_0_ih
[ 22%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/kmeans_kmeans.cpp.sycl
[ 23%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/atax_atax.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 23%] Built target mvt_mvt.cpp_0_ih
Scanning dependencies of target segmentedreduction_segmentedreduction.cpp_0_ih
[ 24%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/segmentedreduction_segmentedreduction.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 24%] Built target pattern_L2_pattern_L2.cpp_0_ih
Scanning dependencies of target mol_dyn_mol_dyn.cpp_0_ih
[ 25%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/mol_dyn_mol_dyn.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 25%] Built target sobel_sobel.cpp_0_ih
Scanning dependencies of target syrk_syrk.cpp_0_ih
[ 26%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/syrk_syrk.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 26%] Built target gramschmidt_gramschmidt.cpp_0_ih
Scanning dependencies of target syr2k_syr2k.cpp_0_ih
[ 27%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/syr2k_syr2k.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 27%] Built target scalar_prod_scalar_prod.cpp_0_ih
Scanning dependencies of target bicg_bicg.cpp_0_ih
[ 28%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/bicg_bicg.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 28%] Built target covariance_covariance.cpp_0_ih
Scanning dependencies of target lin_reg_coeff_lin_reg_coeff.cpp_0_ih
[ 29%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/lin_reg_coeff_lin_reg_coeff.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 29%] Built target lin_reg_error_lin_reg_error.cpp_0_ih
Scanning dependencies of target dag_task_throughput_sequential_dag_task_throughput_sequential.cpp_0_ih
[ 30%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/dag_task_throughput_sequential_dag_task_throughput_sequential.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 30%] Built target atax_atax.cpp_0_ih
Scanning dependencies of target vec_add_vec_add.cpp_0_ih
[ 30%] Built target arith_arith.cpp_0_ih
[ 31%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/vec_add_vec_add.cpp.sycl
Scanning dependencies of target dag_task_throughput_independent_dag_task_throughput_independent.cpp_0_ih
[ 32%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/dag_task_throughput_independent_dag_task_throughput_independent.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 32%] Built target local_mem_local_mem.cpp_0_ih
Scanning dependencies of target blocked_transform_blocked_transform.cpp_0_ih
[ 33%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/blocked_transform_blocked_transform.cpp.sycl
[ 33%] Built target host_device_bandwidth_host_device_bandwidth.cpp_0_ih
Scanning dependencies of target matmulchain_matmulchain.cpp_0_ih
[ 34%] Building ComputeCpp integration header file /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/matmulchain_matmulchain.cpp.sycl
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 34%] Built target kmeans_kmeans.cpp_0_ih
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
Scanning dependencies of target mvt
[ 34%] Built target DRAM_DRAM.cpp_0_ih
Scanning dependencies of target gramschmidt
[ 35%] Building CXX object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[ 35%] Building CXX object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 35%] Built target mol_dyn_mol_dyn.cpp_0_ih
Scanning dependencies of target gesummv
[ 36%] Building CXX object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 36%] Built target median_median.cpp_0_ih
Scanning dependencies of target gemm
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 36%] Built target syrk_syrk.cpp_0_ih
Scanning dependencies of target fdtd2d
[ 37%] Building CXX object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 37%] Built target nbody_nbody.cpp_0_ih
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
Scanning dependencies of target correlation
[ 38%] Building CXX object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 38%] Built target reduction_reduction.cpp_0_ih
Scanning dependencies of target 3mm
[ 39%] Building CXX object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 40%] Building CXX object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 40%] Built target syr2k_syr2k.cpp_0_ih
Scanning dependencies of target 3DConvolution
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 41%] Building CXX object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 41%] Built target bicg_bicg.cpp_0_ih
Scanning dependencies of target 2mm
[ 42%] Building CXX object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 42%] Built target segmentedreduction_segmentedreduction.cpp_0_ih
Scanning dependencies of target 2DConvolution
[ 42%] Building CXX object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 42%] Built target dag_task_throughput_sequential_dag_task_throughput_sequential.cpp_0_ih
[ 42%] Built target lin_reg_coeff_lin_reg_coeff.cpp_0_ih
Scanning dependencies of target sobel7
Scanning dependencies of target sobel
[ 42%] Built target matmulchain_matmulchain.cpp_0_ih
Scanning dependencies of target pattern_L2
[ 43%] Building CXX object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 44%] Building CXX object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 44%] Building CXX object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
[ 44%] Built target vec_add_vec_add.cpp_0_ih
Scanning dependencies of target scalar_prod
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
remark: [Computecpp:CC0027]: Some memcpy/memset intrinsics added by the llvm
      optimizer were replaced by serial functions. This is a workaround for
      OpenCL drivers that do not support those intrinsics. This may impact
      performance, consider using -no-serial-memop. [-Rsycl-serial-memop]
[ 44%] Built target dag_task_throughput_independent_dag_task_throughput_independent.cpp_0_ih
[ 44%] Built target blocked_transform_blocked_transform.cpp_0_ih
[ 45%] Building CXX object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
Scanning dependencies of target sobel5
Scanning dependencies of target sf
[ 46%] Building CXX object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[ 47%] Building CXX object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 48%] Linking CXX executable gemm
[ 49%] Linking CXX executable gesummv
[ 50%] Linking CXX executable mvt
[ 51%] Linking CXX executable gramschmidt
[ 51%] Built target gemm
Scanning dependencies of target nbody
[ 51%] Built target gesummv
Scanning dependencies of target median
[ 51%] Built target mvt
Scanning dependencies of target arith
[ 52%] Building CXX object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 52%] Built target gramschmidt
Scanning dependencies of target local_mem
[ 53%] Building CXX object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 54%] Building CXX object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 55%] Building CXX object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
[ 56%] Linking CXX executable 3DConvolution
[ 56%] Built target 3DConvolution
Scanning dependencies of target DRAM
[ 57%] Building CXX object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 57%] Linking CXX executable fdtd2d
[ 58%] Linking CXX executable 2mm
[ 58%] Built target fdtd2d
Scanning dependencies of target covariance
[ 58%] Built target 2mm
[ 59%] Building CXX object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
Scanning dependencies of target reduction
[ 60%] Linking CXX executable 2DConvolution
[ 61%] Linking CXX executable correlation
[ 62%] Building CXX object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 63%] Linking CXX executable 3mm
[ 63%] Built target 2DConvolution
Scanning dependencies of target host_device_bandwidth
[ 63%] Built target correlation
Scanning dependencies of target lin_reg_error
[ 63%] Built target 3mm
Scanning dependencies of target kmeans
[ 64%] Building CXX object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 65%] Building CXX object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 66%] Building CXX object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
[ 66%] Linking CXX executable sobel
[ 67%] Linking CXX executable sobel7
[ 67%] Built target sobel
Scanning dependencies of target atax
[ 68%] Building CXX object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 68%] Built target sobel7
Scanning dependencies of target segmentedreduction
[ 69%] Building CXX object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
[ 70%] Linking CXX executable sf
[ 71%] Linking CXX executable sobel5
[ 71%] Built target sf
Scanning dependencies of target mol_dyn
[ 72%] Building CXX object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 72%] Built target sobel5
Scanning dependencies of target syrk
[ 73%] Building CXX object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[ 74%] Linking CXX executable median
[ 74%] Built target median
Scanning dependencies of target syr2k
[ 74%] Building CXX object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[ 75%] Linking CXX executable covariance
[ 75%] Built target covariance
Scanning dependencies of target bicg
[ 76%] Building CXX object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 77%] Linking CXX executable lin_reg_error
[ 77%] Built target lin_reg_error
Scanning dependencies of target lin_reg_coeff
[ 77%] Building CXX object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 78%] Linking CXX executable kmeans
[ 78%] Built target kmeans
Scanning dependencies of target dag_task_throughput_sequential
[ 78%] Linking CXX executable arith
[ 79%] Building CXX object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 80%] Linking CXX executable local_mem
[ 80%] Built target arith
Scanning dependencies of target vec_add
[ 81%] Building CXX object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 81%] Built target local_mem
Scanning dependencies of target dag_task_throughput_independent
[ 82%] Building CXX object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 83%] Linking CXX executable nbody
[ 83%] Built target nbody
Scanning dependencies of target blocked_transform
[ 84%] Linking CXX executable syrk
[ 84%] Building CXX object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 85%] Linking CXX executable scalar_prod
[ 85%] Linking CXX executable mol_dyn
[ 85%] Built target syrk
Scanning dependencies of target matmulchain
[ 86%] Building CXX object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 87%] Linking CXX executable atax
[ 87%] Built target mol_dyn
[ 87%] Built target scalar_prod
[ 87%] Built target atax
[ 88%] Linking CXX executable DRAM
[ 89%] Linking CXX executable syr2k
[ 89%] Built target DRAM
[ 89%] Built target syr2k
[ 90%] Linking CXX executable pattern_L2
[ 90%] Built target pattern_L2
[ 91%] Linking CXX executable reduction
[ 91%] Built target reduction
[ 92%] Linking CXX executable bicg
[ 92%] Built target bicg
[ 93%] Linking CXX executable host_device_bandwidth
[ 93%] Built target host_device_bandwidth
[ 94%] Linking CXX executable lin_reg_coeff
[ 94%] Built target lin_reg_coeff
[ 95%] Linking CXX executable dag_task_throughput_sequential
[ 95%] Built target dag_task_throughput_sequential
[ 96%] Linking CXX executable dag_task_throughput_independent
[ 96%] Built target dag_task_throughput_independent
[ 97%] Linking CXX executable matmulchain
[ 97%] Built target matmulchain
[ 98%] Linking CXX executable segmentedreduction
[ 98%] Built target segmentedreduction
[ 99%] Linking CXX executable blocked_transform
[ 99%] Built target blocked_transform
[100%] Linking CXX executable vec_add
[100%] Built target vec_add

Update the default symlink for the running suite to operate on.

In [34]:
! rm -r ./benchmarks
! ln -s ./computecpp-benchmarks ./benchmarks

Run the benchmarks on CPU:

In [41]:
! ./run-suite cpu
Using test profile: cpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 471.2177422650002 s


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.4263444839998556 s


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 2501.2809642150005 s


##################################################
Processing correlation
##################################################
0.0 10800.0
__________________________________________________

correlation --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 1.1728303609997965 s


##################################################
Processing mvt
##################################################
0.0 10800.0
__________________________________________________

mvt --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
==> Benchmark run finished in 119.41664978600056 s


##################################################
Processing arith
##################################################
0.0 10800.0
__________________________________________________

arith --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.428975120999894 s


##################################################
Processing lin_reg_error
##################################################
0.0 10800.0
__________________________________________________

lin_reg_error --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=65536 --local=256
==> Benchmark run finished in 494.0880809089995 s


##################################################
Processing 3DConvolution
##################################################
0.0 10800.0
__________________________________________________

3DConvolution --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/3DConvolution with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing gramschmidt
##################################################
0.0 10800.0
__________________________________________________

gramschmidt --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 121.92851670799973 s


##################################################
Processing atax
##################################################
0.0 10800.0
__________________________________________________

atax --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=4096 --local=256
==> Benchmark run finished in 0.5185969840003963 s


##################################################
Processing 2mm
##################################################
0.0 10800.0
__________________________________________________

2mm --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/2mm with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing gesummv
##################################################
0.0 10800.0
__________________________________________________

gesummv --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
==> Benchmark run finished in 174.8133164359988 s


##################################################
Processing matmulchain
##################################################
0.0 10800.0
__________________________________________________

matmulchain --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/matmulchain with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing vec_add
##################################################
0.0 10800.0
__________________________________________________

vec_add --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 4.422523392000585 s


##################################################
Processing bicg
##################################################
0.0 10800.0
__________________________________________________

bicg --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
==> Benchmark run finished in 3.4472850949987333 s


##################################################
Processing dag_task_throughput_independent
##################################################
0.0 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 18.842967656999463 s
18.842967656999463 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=2048 --local=256
==> Benchmark run finished in 33.84580073500001 s
33.84580073500001 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=4096 --local=256
==> Benchmark run finished in 65.48590637899906 s
65.48590637899906 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=8192 --local=256
==> Benchmark run finished in 174.98742785600007 s
174.98742785600007 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
==> Benchmark run finished in 542.0171376829985 s
542.0171376829985 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=32768 --local=256
==> Benchmark run finished in 1918.0627411829992 s
1918.0627411829992 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=65536 --local=256
==> Benchmark run finished in 5381.360100402002 s


##################################################
Processing local_mem
##################################################
0.0 10800.0
__________________________________________________

local_mem --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.45480192900140537 s


##################################################
Processing nbody
##################################################
0.0 10800.0
__________________________________________________

nbody --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 8.252817296001012 s


##################################################
Processing gemm
##################################################
0.0 10800.0
__________________________________________________

gemm --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/gemm with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing mol_dyn
##################################################
0.0 10800.0
__________________________________________________

mol_dyn --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 41.13749555400136 s


##################################################
Processing kmeans
##################################################
0.0 10800.0
__________________________________________________

kmeans --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 3.6586995469988324 s


##################################################
Processing pattern_L2
##################################################
0.0 10800.0
__________________________________________________

pattern_L2 --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 4.306664562998776 s


##################################################
Processing 3mm
##################################################
0.0 10800.0
__________________________________________________

3mm --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/3mm with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing syrk
##################################################
0.0 10800.0
__________________________________________________

syrk --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/syrk with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing reduction
##################################################
0.0 10800.0
__________________________________________________

reduction --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 14.006196064001415 s


##################################################
Processing 2DConvolution
##################################################
0.0 10800.0
__________________________________________________

2DConvolution --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=4096 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/2DConvolution with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=4096', '--local=256']
Benchmark failed, aborting run


##################################################
Processing covariance
##################################################
0.0 10800.0
__________________________________________________

covariance --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 1.0505043140001362 s


##################################################
Processing sobel
##################################################
0.0 10800.0
__________________________________________________

sobel --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 24.245486339998024 s


##################################################
Processing segmentedreduction
##################################################
0.0 10800.0
__________________________________________________

segmentedreduction --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.5320401429999038 s


##################################################
Processing lin_reg_coeff
##################################################
0.0 10800.0
__________________________________________________

lin_reg_coeff --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 6.515725321998616 s


##################################################
Processing fdtd2d
##################################################
0.0 10800.0
__________________________________________________

fdtd2d --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
1 0: 103802580880621008.000000 72057594037927936.000000 30.582077
==> Benchmark run finished in 4.912492524999834 s


##################################################
Processing scalar_prod
##################################################
0.0 10800.0
__________________________________________________

scalar_prod --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 17.45581575899996 s


##################################################
Processing sobel7
##################################################
0.0 10800.0
__________________________________________________

sobel7 --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/sobel7 with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing syr2k
##################################################
0.0 10800.0
__________________________________________________

syr2k --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/syr2k with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing dag_task_throughput_sequential
##################################################
0.0 10800.0
__________________________________________________

dag_task_throughput_sequential --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 9.003073183001106 s


##################################################
Processing DRAM
##################################################
0.0 10800.0
__________________________________________________

DRAM --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/computecpp-kt-benchmarks/DRAM with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=cpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing median
##################################################
0.0 10800.0
__________________________________________________

median --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 24.926173062998714 s


##################################################
Processing sobel5
##################################################
0.0 10800.0
__________________________________________________

sobel5 --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 25.103333571001713 s
The following benchmarks were aborted because they returned a non-zero returncode: 3DConvolution 2mm matmulchain gemm 3mm syrk 2DConvolution sobel7 syr2k DRAM

Rename the results to something more descriptive.

In [42]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-computecpp.csv

Note: ComputeCPP's support on Nvidia GPUs is expermental -- it uses a PTX backend. Most benchmarks fail to compile with it so it is omitted.

DPC++

Compile with OpenCL, CPU and CUDA/PTX backends.

Note: We expect some benchmarks to fail to build as at the time of investigation (May 2020) SYCL on CUDA in DPC++ had ~60% of features implemented.

Note: We cannot collect individual kernel times with DPC++ because the benchmark suite uses they SYCL profiling info construct which is not supported on host device.

OpenCL
CPU

We must hide the OpenCL ICDs during compilation time to force the CPU backend.

In [56]:
! mv /etc/OpenCL/vendors/amd.icd /etc/OpenCL/vendors/amd.icdX
! mv /etc/OpenCL/vendors/nvidia.icd /etc/OpenCL/vendors/nvidia.icdX
! rm -r ./dpc++-cpu-benchmarks
! mkdir ./dpc++-cpu-benchmarks && cd ./dpc++-cpu-benchmarks && cmake ../.. -DSYCL_IMPL=LLVM -DDPC++_INSTALL_DIR=/tmp/llvm-sycl/build/install && make -j16 --keep-going
! cd ./dpc++-cpu-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles cmake_install.cmake && cd ..
! mv /etc/OpenCL/vendors/amd.icdX /etc/OpenCL/vendors/amd.icd
! mv /etc/OpenCL/vendors/nvidia.icdX /etc/OpenCL/vendors/nvidia.icd
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- SYCL-LLVM sycl-post-link found at: /tmp/llvm-sycl/build/install/bin/sycl-post-link
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/dpc++-opencl-kt-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target gramschmidt
Scanning dependencies of target fdtd2d
Scanning dependencies of target mvt
Scanning dependencies of target 3mm
Scanning dependencies of target syr2k
Scanning dependencies of target covariance
Scanning dependencies of target bicg
Scanning dependencies of target gemm
Scanning dependencies of target median
Scanning dependencies of target atax
Scanning dependencies of target 3DConvolution
Scanning dependencies of target matmulchain
Scanning dependencies of target scalar_prod
Scanning dependencies of target kmeans
Scanning dependencies of target sobel5
[  1%] Building CXX object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[  2%] Building CXX object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  3%] Building CXX object CMakeFiles/atax.dir/polybench/atax.cpp.o
[  5%] Building CXX object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  6%] Building CXX object CMakeFiles/median.dir/single-kernel/median.cpp.o
[  7%] Building CXX object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  9%] Building CXX object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
[ 10%] Building CXX object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 13%] Building CXX object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[ 13%] Building CXX object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 15%] Building CXX object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[ 15%] Building CXX object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[ 17%] Building CXX object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 19%] Building CXX object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 19%] Building CXX object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[ 21%] Building CXX object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
clang-11: /tmp/llvm-sycl/clang/lib/CodeGen/CGExprAgg.cpp:1862: void clang::CodeGen::CodeGenFunction::EmitAggExpr(const clang::Expr*, clang::CodeGen::AggValueSlot): Assertion `E && hasAggregateEvaluationKind(E->getType()) && "Invalid aggregate expression to emit"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /tmp/llvm-sycl/build/install/bin/clang-11 -cc1 -triple spir64-unknown-unknown-sycldevice -fsycl -fsycl-is-device -fdeclare-spirv-builtins -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-strict -sycl-std=2017 -emit-llvm-bc -emit-llvm-uselists -disable-free -main-file-name sobel5.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fmath-errno -fno-rounding-math -fno-verbose-asm -mconstructor-aliases -aux-target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /tmp/llvm-sycl/build/install/lib/clang/11.0.0 -internal-isystem /tmp/llvm-sycl/build/install/bin/../include/sycl -D SYCL_BENCH_ENABLE_QUEUE_PROFILING -D __LLVM_SYCL__ -I /workspace/codes/sycl-bench/include -I /workspace/codes/sycl-bench/polybench/common -I /tmp/llvm-sycl/build/install/include/sycl -I /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++17 -fdeprecated-macro -fdebug-compilation-dir /workspace/codes/sycl-bench/bin/dpc++-opencl-kt-benchmarks -ferror-limit 19 -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -faddrsig -o /tmp/sobel5-8e2245.bc -x c++ /workspace/codes/sycl-bench/single-kernel/sobel5.cpp 
1.	<eof> parser at end of file
2.	Per-file LLVM IR generation
3.	/workspace/codes/sycl-bench/single-kernel/sobel5.cpp:60:9: Generating code for declaration '_ZTS17Sobel5BenchKernel'
 #0 0x000056029d998c2a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ed0c2a)
 #1 0x000056029d996924 llvm::sys::RunSignalHandlers() (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ece924)
 #2 0x000056029d996a73 SignalHandler(int) (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ecea73)
 #3 0x00007f36c9963890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007f36c8614e97 raise (/lib/x86_64-linux-gnu/libc.so.6+0x3ee97)
 #5 0x00007f36c8616801 abort (/lib/x86_64-linux-gnu/libc.so.6+0x40801)
 #6 0x00007f36c860639a (/lib/x86_64-linux-gnu/libc.so.6+0x3039a)
 #7 0x00007f36c8606412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x000056029ded3cac clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240bcac)
 #9 0x000056029ded4503 (anonymous namespace)::AggExprEmitter::EmitInitializationToLValue(clang::Expr*, clang::CodeGen::LValue) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240c503)
#10 0x000056029c5edffc (anonymous namespace)::AggExprEmitter::VisitInitListExpr(clang::InitListExpr*) (/tmp/llvm-sycl/build/install/bin/clang-11+0xb25ffc)
#11 0x000056029ded31c3 (anonymous namespace)::AggExprEmitter::Visit(clang::Expr*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240b1c3)
#12 0x000056029ded3aae clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240baae)
#13 0x000056029de91dc1 clang::CodeGen::CodeGenFunction::EmitExprAsInit(clang::Expr const*, clang::ValueDecl const*, clang::CodeGen::LValue, bool) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23c9dc1)
#14 0x000056029de96aa1 clang::CodeGen::CodeGenFunction::EmitAutoVarInit(clang::CodeGen::CodeGenFunction::AutoVarEmission const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23ceaa1)
#15 0x000056029de9a96a clang::CodeGen::CodeGenFunction::EmitAutoVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d296a)
#16 0x000056029de9ad33 clang::CodeGen::CodeGenFunction::EmitVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d2d33)
#17 0x000056029de9b130 clang::CodeGen::CodeGenFunction::EmitDecl(clang::Decl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d3130)
#18 0x000056029dc5623f clang::CodeGen::CodeGenFunction::EmitDeclStmt(clang::DeclStmt const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x218e23f)
#19 0x000056029dc64385 clang::CodeGen::CodeGenFunction::EmitSimpleStmt(clang::Stmt const*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x219c385)
#20 0x000056029dc603e2 clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21983e2)
#21 0x000056029dc60c2c clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2198c2c)
#22 0x000056029dca4307 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21dc307)
#23 0x000056029dcb3dd5 clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21ebdd5)
#24 0x000056029dcef735 clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2227735)
#25 0x000056029dced085 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2225085)
#26 0x000056029dcf437f clang::CodeGen::CodeGenModule::EmitDeferred() (/tmp/llvm-sycl/build/install/bin/clang-11+0x222c37f)
#27 0x000056029dcf450c clang::CodeGen::CodeGenModule::Release() (/tmp/llvm-sycl/build/install/bin/clang-11+0x222c50c)
#28 0x000056029e78cc27 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc4c27)
#29 0x000056029e78b3e5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc33e5)
#30 0x000056029f0f7b89 clang::ParseAST(clang::Sema&, bool, bool) (/tmp/llvm-sycl/build/install/bin/clang-11+0x362fb89)
#31 0x000056029e78a2a8 clang::CodeGenAction::ExecuteAction() (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc22a8)
#32 0x000056029e196d89 clang::FrontendAction::Execute() (/tmp/llvm-sycl/build/install/bin/clang-11+0x26ced89)
#33 0x000056029e152ed2 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x268aed2)
#34 0x000056029e246331 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x277e331)
#35 0x000056029c8ce024 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/tmp/llvm-sycl/build/install/bin/clang-11+0xe06024)
#36 0x000056029c8ca459 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/tmp/llvm-sycl/build/install/bin/clang-11+0xe02459)
#37 0x000056029c84dbd2 main (/tmp/llvm-sycl/build/install/bin/clang-11+0xd85bd2)
#38 0x00007f36c85f7b97 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b97)
#39 0x000056029c8c9fba _start (/tmp/llvm-sycl/build/install/bin/clang-11+0xe01fba)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.0.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /tmp/llvm-sycl/build/install/bin
clang-11: note: diagnostic msg: Error generating preprocessed source(s).
CMakeFiles/sobel5.dir/build.make:62: recipe for target 'CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o' failed
make[2]: *** [CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o] Error 254
make[2]: Target 'CMakeFiles/sobel5.dir/build' not remade because of errors.
CMakeFiles/Makefile2:590: recipe for target 'CMakeFiles/sobel5.dir/all' failed
make[1]: *** [CMakeFiles/sobel5.dir/all] Error 2
Scanning dependencies of target reduction
[ 22%] Building CXX object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 23%] Linking CXX executable syr2k
[ 25%] Linking CXX executable 3DConvolution
[ 26%] Linking CXX executable gemm
[ 26%] Built target syr2k
Scanning dependencies of target DRAM
[ 27%] Linking CXX executable matmulchain
[ 28%] Building CXX object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 28%] Built target 3DConvolution
Scanning dependencies of target arith
[ 30%] Building CXX object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 30%] Built target gemm
Scanning dependencies of target local_mem
[ 30%] Built target matmulchain
[ 31%] Building CXX object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
Scanning dependencies of target correlation
[ 32%] Linking CXX executable 3mm
[ 34%] Building CXX object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 35%] Linking CXX executable atax
[ 36%] Linking CXX executable syrk
[ 38%] Linking CXX executable gramschmidt
[ 38%] Built target 3mm
Scanning dependencies of target sobel
[ 38%] Built target atax
[ 38%] Built target syrk
Scanning dependencies of target pattern_L2
Scanning dependencies of target host_device_bandwidth
[ 39%] Building CXX object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 40%] Building CXX object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 42%] Building CXX object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
[ 42%] Built target gramschmidt
Scanning dependencies of target sobel7
[ 43%] Linking CXX executable mvt
[ 44%] Building CXX object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
[ 46%] Linking CXX executable bicg
[ 46%] Built target mvt
Scanning dependencies of target 2DConvolution
[ 47%] Linking CXX executable covariance
[ 48%] Building CXX object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 48%] Built target bicg
Scanning dependencies of target vec_add
[ 50%] Building CXX object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 50%] Built target covariance
Scanning dependencies of target dag_task_throughput_independent
[ 51%] Building CXX object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 52%] Linking CXX executable fdtd2d
[ 52%] Built target fdtd2d
Scanning dependencies of target blocked_transform
[ 53%] Building CXX object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 55%] Linking CXX executable median
[ 55%] Built target median
Scanning dependencies of target lin_reg_error
[ 56%] Linking CXX executable kmeans
[ 57%] Building CXX object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 57%] Built target kmeans
Scanning dependencies of target segmentedreduction
[ 59%] Building CXX object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
clang-11: /tmp/llvm-sycl/clang/lib/CodeGen/CGExprAgg.cpp:1862: void clang::CodeGen::CodeGenFunction::EmitAggExpr(const clang::Expr*, clang::CodeGen::AggValueSlot): Assertion `E && hasAggregateEvaluationKind(E->getType()) && "Invalid aggregate expression to emit"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /tmp/llvm-sycl/build/install/bin/clang-11 -cc1 -triple spir64-unknown-unknown-sycldevice -fsycl -fsycl-is-device -fdeclare-spirv-builtins -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-strict -sycl-std=2017 -emit-llvm-bc -emit-llvm-uselists -disable-free -main-file-name sobel.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fmath-errno -fno-rounding-math -fno-verbose-asm -mconstructor-aliases -aux-target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /tmp/llvm-sycl/build/install/lib/clang/11.0.0 -internal-isystem /tmp/llvm-sycl/build/install/bin/../include/sycl -D SYCL_BENCH_ENABLE_QUEUE_PROFILING -D __LLVM_SYCL__ -I /workspace/codes/sycl-bench/include -I /workspace/codes/sycl-bench/polybench/common -I /tmp/llvm-sycl/build/install/include/sycl -I /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++17 -fdeprecated-macro -fdebug-compilation-dir /workspace/codes/sycl-bench/bin/dpc++-opencl-kt-benchmarks -ferror-limit 19 -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -faddrsig -o /tmp/sobel-7b21d3.bc -x c++ /workspace/codes/sycl-bench/single-kernel/sobel.cpp 
1.	<eof> parser at end of file
2.	Per-file LLVM IR generation
3.	/workspace/codes/sycl-bench/single-kernel/sobel.cpp:50:57: Generating code for declaration '_ZTS16SobelBenchKernel'
 #0 0x000055d46cfbbc2a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ed0c2a)
 #1 0x000055d46cfb9924 llvm::sys::RunSignalHandlers() (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ece924)
 #2 0x000055d46cfb9a73 SignalHandler(int) (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ecea73)
 #3 0x00007f619b2ea890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007f6199f9be97 raise (/lib/x86_64-linux-gnu/libc.so.6+0x3ee97)
 #5 0x00007f6199f9d801 abort (/lib/x86_64-linux-gnu/libc.so.6+0x40801)
 #6 0x00007f6199f8d39a (/lib/x86_64-linux-gnu/libc.so.6+0x3039a)
 #7 0x00007f6199f8d412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x000055d46d4f6cac clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240bcac)
 #9 0x000055d46d4f7503 (anonymous namespace)::AggExprEmitter::EmitInitializationToLValue(clang::Expr*, clang::CodeGen::LValue) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240c503)
#10 0x000055d46bc10ffc (anonymous namespace)::AggExprEmitter::VisitInitListExpr(clang::InitListExpr*) (/tmp/llvm-sycl/build/install/bin/clang-11+0xb25ffc)
#11 0x000055d46d4f61c3 (anonymous namespace)::AggExprEmitter::Visit(clang::Expr*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240b1c3)
#12 0x000055d46d4f6aae clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240baae)
#13 0x000055d46d4b4dc1 clang::CodeGen::CodeGenFunction::EmitExprAsInit(clang::Expr const*, clang::ValueDecl const*, clang::CodeGen::LValue, bool) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23c9dc1)
#14 0x000055d46d4b9aa1 clang::CodeGen::CodeGenFunction::EmitAutoVarInit(clang::CodeGen::CodeGenFunction::AutoVarEmission const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23ceaa1)
#15 0x000055d46d4bd96a clang::CodeGen::CodeGenFunction::EmitAutoVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d296a)
#16 0x000055d46d4bdd33 clang::CodeGen::CodeGenFunction::EmitVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d2d33)
#17 0x000055d46d4be130 clang::CodeGen::CodeGenFunction::EmitDecl(clang::Decl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d3130)
#18 0x000055d46d27923f clang::CodeGen::CodeGenFunction::EmitDeclStmt(clang::DeclStmt const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x218e23f)
#19 0x000055d46d287385 clang::CodeGen::CodeGenFunction::EmitSimpleStmt(clang::Stmt const*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x219c385)
#20 0x000055d46d2833e2 clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21983e2)
#21 0x000055d46d283c2c clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2198c2c)
#22 0x000055d46d2c7307 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21dc307)
#23 0x000055d46d2d6dd5 clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21ebdd5)
#24 0x000055d46d312735 clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2227735)
#25 0x000055d46d310085 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2225085)
#26 0x000055d46d31737f clang::CodeGen::CodeGenModule::EmitDeferred() (/tmp/llvm-sycl/build/install/bin/clang-11+0x222c37f)
#27 0x000055d46d31750c clang::CodeGen::CodeGenModule::Release() (/tmp/llvm-sycl/build/install/bin/clang-11+0x222c50c)
#28 0x000055d46ddafc27 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc4c27)
#29 0x000055d46ddae3e5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc33e5)
#30 0x000055d46e71ab89 clang::ParseAST(clang::Sema&, bool, bool) (/tmp/llvm-sycl/build/install/bin/clang-11+0x362fb89)
#31 0x000055d46ddad2a8 clang::CodeGenAction::ExecuteAction() (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc22a8)
#32 0x000055d46d7b9d89 clang::FrontendAction::Execute() (/tmp/llvm-sycl/build/install/bin/clang-11+0x26ced89)
#33 0x000055d46d775ed2 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x268aed2)
#34 0x000055d46d869331 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x277e331)
#35 0x000055d46bef1024 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/tmp/llvm-sycl/build/install/bin/clang-11+0xe06024)
#36 0x000055d46beed459 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/tmp/llvm-sycl/build/install/bin/clang-11+0xe02459)
#37 0x000055d46be70bd2 main (/tmp/llvm-sycl/build/install/bin/clang-11+0xd85bd2)
#38 0x00007f6199f7eb97 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b97)
#39 0x000055d46beecfba _start (/tmp/llvm-sycl/build/install/bin/clang-11+0xe01fba)
clang-11: /tmp/llvm-sycl/clang/lib/CodeGen/CGExprAgg.cpp:1862: void clang::CodeGen::CodeGenFunction::EmitAggExpr(const clang::Expr*, clang::CodeGen::AggValueSlot): Assertion `E && hasAggregateEvaluationKind(E->getType()) && "Invalid aggregate expression to emit"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /tmp/llvm-sycl/build/install/bin/clang-11 -cc1 -triple spir64-unknown-unknown-sycldevice -fsycl -fsycl-is-device -fdeclare-spirv-builtins -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-strict -sycl-std=2017 -emit-llvm-bc -emit-llvm-uselists -disable-free -main-file-name sobel7.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fmath-errno -fno-rounding-math -fno-verbose-asm -mconstructor-aliases -aux-target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /tmp/llvm-sycl/build/install/lib/clang/11.0.0 -internal-isystem /tmp/llvm-sycl/build/install/bin/../include/sycl -D SYCL_BENCH_ENABLE_QUEUE_PROFILING -D __LLVM_SYCL__ -I /workspace/codes/sycl-bench/include -I /workspace/codes/sycl-bench/polybench/common -I /tmp/llvm-sycl/build/install/include/sycl -I /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++17 -fdeprecated-macro -fdebug-compilation-dir /workspace/codes/sycl-bench/bin/dpc++-opencl-kt-benchmarks -ferror-limit 19 -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -faddrsig -o /tmp/sobel7-5d2a32.bc -x c++ /workspace/codes/sycl-bench/single-kernel/sobel7.cpp 
1.	<eof> parser at end of file
2.	Per-file LLVM IR generation
3.	/workspace/codes/sycl-bench/single-kernel/sobel7.cpp:52:52: Generating code for declaration '_ZTS17Sobel7BenchKernel'
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.0.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /tmp/llvm-sycl/build/install/bin
 #0 0x000055adbb903c2a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ed0c2a)
 #1 0x000055adbb901924 llvm::sys::RunSignalHandlers() (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ece924)
 #2 0x000055adbb901a73 SignalHandler(int) (/tmp/llvm-sycl/build/install/bin/clang-11+0x1ecea73)
 #3 0x00007fcef2bc1890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007fcef1872e97 raise (/lib/x86_64-linux-gnu/libc.so.6+0x3ee97)
 #5 0x00007fcef1874801 abort (/lib/x86_64-linux-gnu/libc.so.6+0x40801)
 #6 0x00007fcef186439a (/lib/x86_64-linux-gnu/libc.so.6+0x3039a)
 #7 0x00007fcef1864412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x000055adbbe3ecac clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240bcac)
 #9 0x000055adbbe3f503 (anonymous namespace)::AggExprEmitter::EmitInitializationToLValue(clang::Expr*, clang::CodeGen::LValue) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240c503)
#10 0x000055adba558ffc (anonymous namespace)::AggExprEmitter::VisitInitListExpr(clang::InitListExpr*) (/tmp/llvm-sycl/build/install/bin/clang-11+0xb25ffc)
#11 0x000055adbbe3e1c3 (anonymous namespace)::AggExprEmitter::Visit(clang::Expr*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240b1c3)
#12 0x000055adbbe3eaae clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x240baae)
#13 0x000055adbbdfcdc1 clang::CodeGen::CodeGenFunction::EmitExprAsInit(clang::Expr const*, clang::ValueDecl const*, clang::CodeGen::LValue, bool) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23c9dc1)
#14 0x000055adbbe01aa1 clang::CodeGen::CodeGenFunction::EmitAutoVarInit(clang::CodeGen::CodeGenFunction::AutoVarEmission const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23ceaa1)
#15 0x000055adbbe0596a clang::CodeGen::CodeGenFunction::EmitAutoVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d296a)
#16 0x000055adbbe05d33 clang::CodeGen::CodeGenFunction::EmitVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d2d33)
#17 0x000055adbbe06130 clang::CodeGen::CodeGenFunction::EmitDecl(clang::Decl const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x23d3130)
#18 0x000055adbbbc123f clang::CodeGen::CodeGenFunction::EmitDeclStmt(clang::DeclStmt const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x218e23f)
#19 0x000055adbbbcf385 clang::CodeGen::CodeGenFunction::EmitSimpleStmt(clang::Stmt const*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x219c385)
#20 0x000055adbbbcb3e2 clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21983e2)
#21 0x000055adbbbcbc2c clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2198c2c)
#22 0x000055adbbc0f307 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21dc307)
#23 0x000055adbbc1edd5 clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x21ebdd5)
#24 0x000055adbbc5a735 clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2227735)
#25 0x000055adbbc58085 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2225085)
#26 0x000055adbbc5f37f clang::CodeGen::CodeGenModule::EmitDeferred() (/tmp/llvm-sycl/build/install/bin/clang-11+0x222c37f)
#27 0x000055adbbc5f50c clang::CodeGen::CodeGenModule::Release() (/tmp/llvm-sycl/build/install/bin/clang-11+0x222c50c)
#28 0x000055adbc6f7c27 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc4c27)
#29 0x000055adbc6f63e5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc33e5)
#30 0x000055adbd062b89 clang::ParseAST(clang::Sema&, bool, bool) (/tmp/llvm-sycl/build/install/bin/clang-11+0x362fb89)
#31 0x000055adbc6f52a8 clang::CodeGenAction::ExecuteAction() (/tmp/llvm-sycl/build/install/bin/clang-11+0x2cc22a8)
#32 0x000055adbc101d89 clang::FrontendAction::Execute() (/tmp/llvm-sycl/build/install/bin/clang-11+0x26ced89)
#33 0x000055adbc0bded2 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/tmp/llvm-sycl/build/install/bin/clang-11+0x268aed2)
#34 0x000055adbc1b1331 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/tmp/llvm-sycl/build/install/bin/clang-11+0x277e331)
#35 0x000055adba839024 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/tmp/llvm-sycl/build/install/bin/clang-11+0xe06024)
#36 0x000055adba835459 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/tmp/llvm-sycl/build/install/bin/clang-11+0xe02459)
#37 0x000055adba7b8bd2 main (/tmp/llvm-sycl/build/install/bin/clang-11+0xd85bd2)
#38 0x00007fcef1855b97 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b97)
#39 0x000055adba834fba _start (/tmp/llvm-sycl/build/install/bin/clang-11+0xe01fba)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.0.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /tmp/llvm-sycl/build/install/bin
clang-11: note: diagnostic msg: Error generating preprocessed source(s).
CMakeFiles/sobel.dir/build.make:62: recipe for target 'CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o' failed
make[2]: *** [CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o] Error 254
make[2]: Target 'CMakeFiles/sobel.dir/build' not remade because of errors.
CMakeFiles/Makefile2:849: recipe for target 'CMakeFiles/sobel.dir/all' failed
make[1]: *** [CMakeFiles/sobel.dir/all] Error 2
Scanning dependencies of target lin_reg_coeff
[ 60%] Building CXX object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
clang-11: note: diagnostic msg: Error generating preprocessed source(s).
CMakeFiles/sobel7.dir/build.make:62: recipe for target 'CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o' failed
make[2]: *** [CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o] Error 254
make[2]: Target 'CMakeFiles/sobel7.dir/build' not remade because of errors.
CMakeFiles/Makefile2:960: recipe for target 'CMakeFiles/sobel7.dir/all' failed
make[1]: *** [CMakeFiles/sobel7.dir/all] Error 2
Scanning dependencies of target mol_dyn
[ 61%] Building CXX object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 63%] Linking CXX executable reduction
[ 64%] Linking CXX executable scalar_prod
[ 64%] Built target reduction
Scanning dependencies of target gesummv
[ 65%] Building CXX object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 65%] Built target scalar_prod
Scanning dependencies of target 2mm
[ 67%] Building CXX object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
[ 68%] Linking CXX executable 2DConvolution
[ 68%] Built target 2DConvolution
Scanning dependencies of target sf
[ 69%] Building CXX object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 71%] Linking CXX executable correlation
[ 71%] Built target correlation
Scanning dependencies of target nbody
[ 72%] Building CXX object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 73%] Linking CXX executable dag_task_throughput_independent
[ 73%] Built target dag_task_throughput_independent
Scanning dependencies of target dag_task_throughput_sequential
[ 75%] Building CXX object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 76%] Linking CXX executable arith
[ 77%] Linking CXX executable blocked_transform
[ 78%] Linking CXX executable local_mem
[ 78%] Built target arith
[ 78%] Built target blocked_transform
[ 78%] Built target local_mem
[ 80%] Linking CXX executable lin_reg_error
[ 80%] Built target lin_reg_error
[ 81%] Linking CXX executable vec_add
[ 81%] Built target vec_add
[ 82%] Linking CXX executable mol_dyn
[ 82%] Built target mol_dyn
[ 84%] Linking CXX executable lin_reg_coeff
[ 84%] Built target lin_reg_coeff
[ 85%] Linking CXX executable DRAM
[ 85%] Built target DRAM
[ 86%] Linking CXX executable host_device_bandwidth
[ 86%] Built target host_device_bandwidth
[ 88%] Linking CXX executable pattern_L2
[ 89%] Linking CXX executable segmentedreduction
[ 89%] Built target pattern_L2
[ 89%] Built target segmentedreduction
[ 90%] Linking CXX executable 2mm
[ 92%] Linking CXX executable gesummv
[ 92%] Built target 2mm
[ 92%] Built target gesummv
[ 93%] Linking CXX executable sf
[ 93%] Built target sf
[ 94%] Linking CXX executable dag_task_throughput_sequential
[ 94%] Built target dag_task_throughput_sequential
[ 96%] Linking CXX executable nbody
[ 96%] Built target nbody
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Update the default symlink for the running suite to operate on.

In [57]:
! rm -r ./benchmarks
! ln -s ./dpc++-benchmarks ./benchmarks

Run the benchmarks on CPU:

In [59]:
! rm -r ./sycl-bench.csv
! LD_LIBRARY_PATH=${LD_LIBARARY_PATH}:/tmp/llvm-sycl/build/install/lib ./run-suite cpu
rm: cannot remove './sycl-bench.csv': No such file or directory
Using test profile: cpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 7.996840306994272 s


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06841107699437998 s


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.7253465630055871 s


##################################################
Processing correlation
##################################################
0.0 10800.0
__________________________________________________

correlation --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05386332700436469 s


##################################################
Processing mvt
##################################################
0.0 10800.0
__________________________________________________

mvt --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.7700822850019904 s


##################################################
Processing arith
##################################################
0.0 10800.0
__________________________________________________

arith --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05822460899071302 s


##################################################
Processing lin_reg_error
##################################################
0.0 10800.0
__________________________________________________

lin_reg_error --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=65536 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05207790400891099 s


##################################################
Processing 3DConvolution
##################################################
0.0 10800.0
__________________________________________________

3DConvolution --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 6.367260508006439 s


##################################################
Processing gramschmidt
##################################################
0.0 10800.0
__________________________________________________

gramschmidt --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06740878900745884 s


##################################################
Processing atax
##################################################
0.0 10800.0
__________________________________________________

atax --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=4096 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.1087961730081588 s


##################################################
Processing 2mm
##################################################
0.0 10800.0
__________________________________________________

2mm --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06541222298983485 s


##################################################
Processing gesummv
##################################################
0.0 10800.0
__________________________________________________

gesummv --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 1.3089653370116139 s


##################################################
Processing matmulchain
##################################################
0.0 10800.0
__________________________________________________

matmulchain --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.07680065599561203 s


##################################################
Processing vec_add
##################################################
0.0 10800.0
__________________________________________________

vec_add --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.09199294199061114 s


##################################################
Processing bicg
##################################################
0.0 10800.0
__________________________________________________

bicg --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.8208727849996649 s


##################################################
Processing dag_task_throughput_independent
##################################################
0.0 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05028725299052894 s
0.05028725299052894 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=2048 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.057712604000698775 s
0.057712604000698775 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=4096 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.041197665996151045 s
0.057712604000698775 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=8192 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05708987099933438 s
0.057712604000698775 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=16384 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05875822600501124 s
0.05875822600501124 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=32768 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.048430522001581267 s
0.05875822600501124 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=65536 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.04256631201133132 s


##################################################
Processing local_mem
##################################################
0.0 10800.0
__________________________________________________

local_mem --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05075932899489999 s


##################################################
Processing nbody
##################################################
0.0 10800.0
__________________________________________________

nbody --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05773723700258415 s


##################################################
Processing gemm
##################################################
0.0 10800.0
__________________________________________________

gemm --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05876432699733414 s


##################################################
Processing mol_dyn
##################################################
0.0 10800.0
__________________________________________________

mol_dyn --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.32109780599421356 s


##################################################
Processing kmeans
##################################################
0.0 10800.0
__________________________________________________

kmeans --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.08392896200530231 s


##################################################
Processing pattern_L2
##################################################
0.0 10800.0
__________________________________________________

pattern_L2 --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06638163099705707 s


##################################################
Processing 3mm
##################################################
0.0 10800.0
__________________________________________________

3mm --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.07739037800638471 s


##################################################
Processing syrk
##################################################
0.0 10800.0
__________________________________________________

syrk --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06403369100007694 s


##################################################
Processing reduction
##################################################
0.0 10800.0
__________________________________________________

reduction --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.08808377500099596 s


##################################################
Processing 2DConvolution
##################################################
0.0 10800.0
__________________________________________________

2DConvolution --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=4096 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.37707202199089807 s


##################################################
Processing covariance
##################################################
0.0 10800.0
__________________________________________________

covariance --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06423056899802759 s


##################################################
Processing segmentedreduction
##################################################
0.0 10800.0
__________________________________________________

segmentedreduction --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05069694700068794 s


##################################################
Processing lin_reg_coeff
##################################################
0.0 10800.0
__________________________________________________

lin_reg_coeff --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.08453162398654968 s


##################################################
Processing fdtd2d
##################################################
0.0 10800.0
__________________________________________________

fdtd2d --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.06504186600795947 s


##################################################
Processing scalar_prod
##################################################
0.0 10800.0
__________________________________________________

scalar_prod --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.10940686499816366 s


##################################################
Processing syr2k
##################################################
0.0 10800.0
__________________________________________________

syr2k --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.05790536700806115 s


##################################################
Processing dag_task_throughput_sequential
##################################################
0.0 10800.0
__________________________________________________

dag_task_throughput_sequential --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.04330127299181186 s


##################################################
Processing DRAM
##################################################
0.0 10800.0
__________________________________________________

DRAM --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 9.000205347008887 s


##################################################
Processing median
##################################################
0.0 10800.0
__________________________________________________

median --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: OpenCL API failed. OpenCL API returns: -34 (CL_INVALID_CONTEXT) -34 (CL_INVALID_CONTEXT)
==> Benchmark run finished in 0.48677783699531574 s
All benchmarks were executed successfully

Rename the results to something more descriptive.

In [42]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-kt-dpc++-cpu.csv
CUDA

Note if running this from within the docker image you may need to rebuild the DPC++ CUDA backend in the running Docker instance -- I believe this may be due to missing CUDA runtime stubs missing during the docker build phase but haven't investigated further. To rebuild the backend:

    cd /tmp
    wget https://github.com/intel/llvm/archive/sycl.zip -O /tmp/llvm-sycl.zip
    unzip /tmp/llvm-sycl.zip -d /tmp
    mv /tmp/llvm-sycl /tmp/llvm-sycl-cuda
    mkdir /tmp/llvm-sycl-cuda/build
    cd /tmp/llvm-sycl-cuda/build
    CC=gcc CXX=g++ CMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs python /tmp/llvm-sycl-cuda/buildbot/configure.py --cuda
    python /tmp/llvm-sycl-cuda/buildbot/compile.py

Build the suite against the DPC++ CUDA backend.

In [60]:
! rm -r ./dpc++-cuda-benchmarks
! mkdir ./dpc++-cuda-benchmarks && cd ./dpc++-cuda-benchmarks && cmake ../.. -DSYCL_IMPL=LLVM-CUDA -DDPC++_INSTALL_DIR=/tmp/llvm-sycl-cuda/build/install && make -j16 --keep-going
! cd ./dpc++-cuda-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- SYCL-LLVM sycl-post-link found at: /tmp/llvm-sycl-cuda/build/install/bin/sycl-post-link
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/dpc++-cuda-kt-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gramschmidt
Scanning dependencies of target gemm
Scanning dependencies of target covariance
Scanning dependencies of target syr2k
Scanning dependencies of target fdtd2d
Scanning dependencies of target bicg
Scanning dependencies of target 3mm
Scanning dependencies of target median
Scanning dependencies of target atax
Scanning dependencies of target 3DConvolution
Scanning dependencies of target sobel5
Scanning dependencies of target matmulchain
Scanning dependencies of target scalar_prod
Scanning dependencies of target kmeans
[  1%] Building CXX object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[  2%] Building CXX object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  3%] Building CXX object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[  5%] Building CXX object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[  6%] Building CXX object CMakeFiles/median.dir/single-kernel/median.cpp.o
[  7%] Building CXX object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
[  9%] Building CXX object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[ 10%] Building CXX object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 11%] Building CXX object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[ 13%] Building CXX object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 14%] Building CXX object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[ 15%] Building CXX object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[ 18%] Building CXX object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
[ 18%] Building CXX object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 19%] Building CXX object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 21%] Building CXX object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
clang-11: /tmp/llvm-sycl-cuda/clang/lib/CodeGen/CGExprAgg.cpp:1862: void clang::CodeGen::CodeGenFunction::EmitAggExpr(const clang::Expr*, clang::CodeGen::AggValueSlot): Assertion `E && hasAggregateEvaluationKind(E->getType()) && "Invalid aggregate expression to emit"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /tmp/llvm-sycl-cuda/build/install/bin/clang-11 -cc1 -triple nvptx64-nvidia-cuda-sycldevice -fsycl -fsycl-is-device -fdeclare-spirv-builtins -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-strict -sycl-std=2017 -emit-llvm-bc -emit-llvm-uselists -disable-free -main-file-name sobel5.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fno-rounding-math -fno-verbose-asm -no-integrated-as -aux-target-cpu x86-64 -internal-isystem /tmp/llvm-sycl-cuda/build/install/bin/../include/sycl -mlink-builtin-bitcode /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/../../clc/libspirv-nvptx64--nvidiacl.bc -mlink-builtin-bitcode /usr/local/cuda-10.1/nvvm/libdevice/libdevice.10.bc -target-feature +ptx64 -target-sdk-version=10.1 -target-cpu sm_30 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0 -internal-isystem /tmp/llvm-sycl-cuda/build/install/bin/../include/sycl -D SYCL_BENCH_ENABLE_QUEUE_PROFILING -D __LLVM_SYCL_CUDA__ -I /workspace/codes/sycl-bench/include -I /workspace/codes/sycl-bench/polybench/common -I /tmp/llvm-sycl-cuda/build/install/include/sycl -I /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++17 -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /workspace/codes/sycl-bench/bin/dpc++-cuda-kt-benchmarks -ferror-limit 19 -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -fsycl-unnamed-lambda -o /tmp/sobel5-05f0a9.bc -x c++ /workspace/codes/sycl-bench/single-kernel/sobel5.cpp 
1.	<eof> parser at end of file
2.	Per-file LLVM IR generation
3.	/workspace/codes/sycl-bench/single-kernel/sobel5.cpp:60:9: Generating code for declaration '_ZTS17Sobel5BenchKernel'
 #0 0x000055e424c50daa llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2034daa)
 #1 0x000055e424c4eaa4 llvm::sys::RunSignalHandlers() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2032aa4)
 #2 0x000055e424c4ebf3 SignalHandler(int) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2032bf3)
 #3 0x00007fb2912db890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007fb28ff8ce97 raise (/lib/x86_64-linux-gnu/libc.so.6+0x3ee97)
 #5 0x00007fb28ff8e801 abort (/lib/x86_64-linux-gnu/libc.so.6+0x40801)
 #6 0x00007fb28ff7e39a (/lib/x86_64-linux-gnu/libc.so.6+0x3039a)
 #7 0x00007fb28ff7e412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x000055e425192ecc clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2576ecc)
 #9 0x000055e425193723 (anonymous namespace)::AggExprEmitter::EmitInitializationToLValue(clang::Expr*, clang::CodeGen::LValue) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2577723)
#10 0x000055e423778190 (anonymous namespace)::AggExprEmitter::VisitInitListExpr(clang::InitListExpr*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xb5c190)
#11 0x000055e4251923e3 (anonymous namespace)::AggExprEmitter::Visit(clang::Expr*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x25763e3)
#12 0x000055e425192cce clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2576cce)
#13 0x000055e425150fe1 clang::CodeGen::CodeGenFunction::EmitExprAsInit(clang::Expr const*, clang::ValueDecl const*, clang::CodeGen::LValue, bool) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2534fe1)
#14 0x000055e425155cc1 clang::CodeGen::CodeGenFunction::EmitAutoVarInit(clang::CodeGen::CodeGenFunction::AutoVarEmission const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2539cc1)
#15 0x000055e425159b8a clang::CodeGen::CodeGenFunction::EmitAutoVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253db8a)
#16 0x000055e425159f53 clang::CodeGen::CodeGenFunction::EmitVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253df53)
#17 0x000055e42515a350 clang::CodeGen::CodeGenFunction::EmitDecl(clang::Decl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253e350)
#18 0x000055e424f1545f clang::CodeGen::CodeGenFunction::EmitDeclStmt(clang::DeclStmt const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x22f945f)
#19 0x000055e424f235a5 clang::CodeGen::CodeGenFunction::EmitSimpleStmt(clang::Stmt const*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x23075a5)
#20 0x000055e424f1f602 clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2303602)
#21 0x000055e424f1fe4c clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2303e4c)
#22 0x000055e424f63527 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2347527)
#23 0x000055e424f72ff5 clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2356ff5)
#24 0x000055e424fae955 clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2392955)
#25 0x000055e424fac2a5 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x23902a5)
#26 0x000055e424fb359f clang::CodeGen::CodeGenModule::EmitDeferred() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x239759f)
#27 0x000055e424fb372c clang::CodeGen::CodeGenModule::Release() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x239772c)
#28 0x000055e425a4b397 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2f397)
#29 0x000055e425a49b55 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2db55)
#30 0x000055e426309029 clang::ParseAST(clang::Sema&, bool, bool) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x36ed029)
#31 0x000055e425a48a18 clang::CodeGenAction::ExecuteAction() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2ca18)
#32 0x000055e425455f09 clang::FrontendAction::Execute() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2839f09)
#33 0x000055e425412052 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x27f6052)
#34 0x000055e4255054b1 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x28e94b1)
#35 0x000055e423a50414 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe34414)
#36 0x000055e423a4c839 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe30839)
#37 0x000055e4239cf1f2 main (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xdb31f2)
#38 0x00007fb28ff6fb97 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b97)
#39 0x000055e423a4c39a _start (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe3039a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.0.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /tmp/llvm-sycl-cuda/build/install/bin
clang-11: note: diagnostic msg: Error generating preprocessed source(s).
CMakeFiles/sobel5.dir/build.make:62: recipe for target 'CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o' failed
make[2]: *** [CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o] Error 254
make[2]: Target 'CMakeFiles/sobel5.dir/build' not remade because of errors.
CMakeFiles/Makefile2:590: recipe for target 'CMakeFiles/sobel5.dir/all' failed
make[1]: *** [CMakeFiles/sobel5.dir/all] Error 2
Scanning dependencies of target reduction
[ 22%] Building CXX object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 23%] Linking CXX executable gemm
[ 25%] Linking CXX executable 3DConvolution
[ 26%] Linking CXX executable syr2k
[ 27%] Linking CXX executable syrk
[ 27%] Built target gemm
Scanning dependencies of target DRAM
[ 27%] Built target 3DConvolution
Scanning dependencies of target arith
[ 28%] Building CXX object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 30%] Building CXX object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 30%] Built target syr2k
Scanning dependencies of target local_mem
[ 31%] Linking CXX executable gramschmidt
[ 32%] Building CXX object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
[ 32%] Built target syrk
Scanning dependencies of target correlation
[ 34%] Building CXX object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 35%] Linking CXX executable bicg
[ 36%] Linking CXX executable 3mm
[ 36%] Built target gramschmidt
Scanning dependencies of target sobel
[ 38%] Building CXX object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 38%] Built target bicg
Scanning dependencies of target pattern_L2
[ 39%] Linking CXX executable fdtd2d
[ 39%] Built target 3mm
[ 40%] Building CXX object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
Scanning dependencies of target host_device_bandwidth
[ 42%] Linking CXX executable atax
[ 43%] Building CXX object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 44%] Linking CXX executable matmulchain
[ 44%] Built target fdtd2d
Scanning dependencies of target sobel7
[ 46%] Building CXX object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
[ 47%] Linking CXX executable median
[ 47%] Built target atax
Scanning dependencies of target 2DConvolution
[ 48%] Building CXX object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 48%] Built target matmulchain
Scanning dependencies of target vec_add
[ 50%] Building CXX object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 51%] Linking CXX executable mvt
[ 51%] Built target median
Scanning dependencies of target dag_task_throughput_independent
[ 52%] Building CXX object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 53%] Linking CXX executable covariance
[ 53%] Built target mvt
Scanning dependencies of target blocked_transform
[ 55%] Building CXX object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 55%] Built target covariance
Scanning dependencies of target lin_reg_error
[ 56%] Building CXX object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 57%] Linking CXX executable kmeans
[ 57%] Built target kmeans
Scanning dependencies of target segmentedreduction
[ 59%] Building CXX object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
clang-11: /tmp/llvm-sycl-cuda/clang/lib/CodeGen/CGExprAgg.cpp:1862: void clang::CodeGen::CodeGenFunction::EmitAggExpr(const clang::Expr*, clang::CodeGen::AggValueSlot): Assertion `E && hasAggregateEvaluationKind(E->getType()) && "Invalid aggregate expression to emit"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /tmp/llvm-sycl-cuda/build/install/bin/clang-11 -cc1 -triple nvptx64-nvidia-cuda-sycldevice -fsycl -fsycl-is-device -fdeclare-spirv-builtins -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-strict -sycl-std=2017 -emit-llvm-bc -emit-llvm-uselists -disable-free -main-file-name sobel.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fno-rounding-math -fno-verbose-asm -no-integrated-as -aux-target-cpu x86-64 -internal-isystem /tmp/llvm-sycl-cuda/build/install/bin/../include/sycl -mlink-builtin-bitcode /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/../../clc/libspirv-nvptx64--nvidiacl.bc -mlink-builtin-bitcode /usr/local/cuda-10.1/nvvm/libdevice/libdevice.10.bc -target-feature +ptx64 -target-sdk-version=10.1 -target-cpu sm_30 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0 -internal-isystem /tmp/llvm-sycl-cuda/build/install/bin/../include/sycl -D SYCL_BENCH_ENABLE_QUEUE_PROFILING -D __LLVM_SYCL_CUDA__ -I /workspace/codes/sycl-bench/include -I /workspace/codes/sycl-bench/polybench/common -I /tmp/llvm-sycl-cuda/build/install/include/sycl -I /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++17 -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /workspace/codes/sycl-bench/bin/dpc++-cuda-kt-benchmarks -ferror-limit 19 -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -fsycl-unnamed-lambda -o /tmp/sobel-f12071.bc -x c++ /workspace/codes/sycl-bench/single-kernel/sobel.cpp 
1.	<eof> parser at end of file
2.	Per-file LLVM IR generation
3.	/workspace/codes/sycl-bench/single-kernel/sobel.cpp:50:57: Generating code for declaration '_ZTS16SobelBenchKernel'
 #0 0x000055bc4d459daa llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2034daa)
 #1 0x000055bc4d457aa4 llvm::sys::RunSignalHandlers() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2032aa4)
 #2 0x000055bc4d457bf3 SignalHandler(int) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2032bf3)
 #3 0x00007f3a9cfaa890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007f3a9bc5be97 raise (/lib/x86_64-linux-gnu/libc.so.6+0x3ee97)
 #5 0x00007f3a9bc5d801 abort (/lib/x86_64-linux-gnu/libc.so.6+0x40801)
 #6 0x00007f3a9bc4d39a (/lib/x86_64-linux-gnu/libc.so.6+0x3039a)
 #7 0x00007f3a9bc4d412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x000055bc4d99becc clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2576ecc)
 #9 0x000055bc4d99c723 (anonymous namespace)::AggExprEmitter::EmitInitializationToLValue(clang::Expr*, clang::CodeGen::LValue) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2577723)
#10 0x000055bc4bf81190 (anonymous namespace)::AggExprEmitter::VisitInitListExpr(clang::InitListExpr*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xb5c190)
#11 0x000055bc4d99b3e3 (anonymous namespace)::AggExprEmitter::Visit(clang::Expr*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x25763e3)
#12 0x000055bc4d99bcce clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2576cce)
#13 0x000055bc4d959fe1 clang::CodeGen::CodeGenFunction::EmitExprAsInit(clang::Expr const*, clang::ValueDecl const*, clang::CodeGen::LValue, bool) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2534fe1)
#14 0x000055bc4d95ecc1 clang::CodeGen::CodeGenFunction::EmitAutoVarInit(clang::CodeGen::CodeGenFunction::AutoVarEmission const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2539cc1)
#15 0x000055bc4d962b8a clang::CodeGen::CodeGenFunction::EmitAutoVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253db8a)
#16 0x000055bc4d962f53 clang::CodeGen::CodeGenFunction::EmitVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253df53)
#17 0x000055bc4d963350 clang::CodeGen::CodeGenFunction::EmitDecl(clang::Decl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253e350)
#18 0x000055bc4d71e45f clang::CodeGen::CodeGenFunction::EmitDeclStmt(clang::DeclStmt const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x22f945f)
#19 0x000055bc4d72c5a5 clang::CodeGen::CodeGenFunction::EmitSimpleStmt(clang::Stmt const*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x23075a5)
#20 0x000055bc4d728602 clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2303602)
#21 0x000055bc4d728e4c clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2303e4c)
#22 0x000055bc4d76c527 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2347527)
#23 0x000055bc4d77bff5 clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2356ff5)
#24 0x000055bc4d7b7955 clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2392955)
#25 0x000055bc4d7b52a5 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x23902a5)
#26 0x000055bc4d7bc59f clang::CodeGen::CodeGenModule::EmitDeferred() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x239759f)
#27 0x000055bc4d7bc72c clang::CodeGen::CodeGenModule::Release() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x239772c)
#28 0x000055bc4e254397 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2f397)
#29 0x000055bc4e252b55 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2db55)
#30 0x000055bc4eb12029 clang::ParseAST(clang::Sema&, bool, bool) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x36ed029)
#31 0x000055bc4e251a18 clang::CodeGenAction::ExecuteAction() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2ca18)
#32 0x000055bc4dc5ef09 clang::FrontendAction::Execute() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2839f09)
#33 0x000055bc4dc1b052 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x27f6052)
#34 0x000055bc4dd0e4b1 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x28e94b1)
#35 0x000055bc4c259414 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe34414)
#36 0x000055bc4c255839 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe30839)
#37 0x000055bc4c1d81f2 main (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xdb31f2)
#38 0x00007f3a9bc3eb97 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b97)
#39 0x000055bc4c25539a _start (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe3039a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.0.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /tmp/llvm-sycl-cuda/build/install/bin
clang-11: /tmp/llvm-sycl-cuda/clang/lib/CodeGen/CGExprAgg.cpp:1862: void clang::CodeGen::CodeGenFunction::EmitAggExpr(const clang::Expr*, clang::CodeGen::AggValueSlot): Assertion `E && hasAggregateEvaluationKind(E->getType()) && "Invalid aggregate expression to emit"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /tmp/llvm-sycl-cuda/build/install/bin/clang-11 -cc1 -triple nvptx64-nvidia-cuda-sycldevice -fsycl -fsycl-is-device -fdeclare-spirv-builtins -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-strict -sycl-std=2017 -emit-llvm-bc -emit-llvm-uselists -disable-free -main-file-name sobel7.cpp -mrelocation-model static -mthread-model posix -mframe-pointer=all -fno-rounding-math -fno-verbose-asm -no-integrated-as -aux-target-cpu x86-64 -internal-isystem /tmp/llvm-sycl-cuda/build/install/bin/../include/sycl -mlink-builtin-bitcode /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/../../clc/libspirv-nvptx64--nvidiacl.bc -mlink-builtin-bitcode /usr/local/cuda-10.1/nvvm/libdevice/libdevice.10.bc -target-feature +ptx64 -target-sdk-version=10.1 -target-cpu sm_30 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0 -internal-isystem /tmp/llvm-sycl-cuda/build/install/bin/../include/sycl -D SYCL_BENCH_ENABLE_QUEUE_PROFILING -D __LLVM_SYCL_CUDA__ -I /workspace/codes/sycl-bench/include -I /workspace/codes/sycl-bench/polybench/common -I /tmp/llvm-sycl-cuda/build/install/include/sycl -I /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /usr/local/include -internal-isystem /tmp/llvm-sycl-cuda/build/install/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -std=c++17 -fdeprecated-macro -fno-dwarf-directory-asm -fdebug-compilation-dir /workspace/codes/sycl-bench/bin/dpc++-cuda-kt-benchmarks -ferror-limit 19 -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -fcolor-diagnostics -vectorize-loops -vectorize-slp -fsycl-unnamed-lambda -o /tmp/sobel7-8bdef4.bc -x c++ /workspace/codes/sycl-bench/single-kernel/sobel7.cpp 
1.	<eof> parser at end of file
2.	Per-file LLVM IR generation
3.	/workspace/codes/sycl-bench/single-kernel/sobel7.cpp:52:52: Generating code for declaration '_ZTS17Sobel7BenchKernel'
 #0 0x000055fea45f9daa llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2034daa)
 #1 0x000055fea45f7aa4 llvm::sys::RunSignalHandlers() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2032aa4)
 #2 0x000055fea45f7bf3 SignalHandler(int) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2032bf3)
 #3 0x00007f5d5c2c7890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007f5d5af78e97 raise (/lib/x86_64-linux-gnu/libc.so.6+0x3ee97)
 #5 0x00007f5d5af7a801 abort (/lib/x86_64-linux-gnu/libc.so.6+0x40801)
 #6 0x00007f5d5af6a39a (/lib/x86_64-linux-gnu/libc.so.6+0x3039a)
 #7 0x00007f5d5af6a412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x000055fea4b3becc clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2576ecc)
 #9 0x000055fea4b3c723 (anonymous namespace)::AggExprEmitter::EmitInitializationToLValue(clang::Expr*, clang::CodeGen::LValue) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2577723)
#10 0x000055fea3121190 (anonymous namespace)::AggExprEmitter::VisitInitListExpr(clang::InitListExpr*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xb5c190)
#11 0x000055fea4b3b3e3 (anonymous namespace)::AggExprEmitter::Visit(clang::Expr*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x25763e3)
#12 0x000055fea4b3bcce clang::CodeGen::CodeGenFunction::EmitAggExpr(clang::Expr const*, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2576cce)
#13 0x000055fea4af9fe1 clang::CodeGen::CodeGenFunction::EmitExprAsInit(clang::Expr const*, clang::ValueDecl const*, clang::CodeGen::LValue, bool) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2534fe1)
#14 0x000055fea4afecc1 clang::CodeGen::CodeGenFunction::EmitAutoVarInit(clang::CodeGen::CodeGenFunction::AutoVarEmission const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2539cc1)
#15 0x000055fea4b02b8a clang::CodeGen::CodeGenFunction::EmitAutoVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253db8a)
#16 0x000055fea4b02f53 clang::CodeGen::CodeGenFunction::EmitVarDecl(clang::VarDecl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253df53)
#17 0x000055fea4b03350 clang::CodeGen::CodeGenFunction::EmitDecl(clang::Decl const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x253e350)
#18 0x000055fea48be45f clang::CodeGen::CodeGenFunction::EmitDeclStmt(clang::DeclStmt const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x22f945f)
#19 0x000055fea48cc5a5 clang::CodeGen::CodeGenFunction::EmitSimpleStmt(clang::Stmt const*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x23075a5)
#20 0x000055fea48c8602 clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2303602)
#21 0x000055fea48c8e4c clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2303e4c)
#22 0x000055fea490c527 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2347527)
#23 0x000055fea491bff5 clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2356ff5)
#24 0x000055fea4957955 clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2392955)
#25 0x000055fea49552a5 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x23902a5)
#26 0x000055fea495c59f clang::CodeGen::CodeGenModule::EmitDeferred() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x239759f)
#27 0x000055fea495c72c clang::CodeGen::CodeGenModule::Release() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x239772c)
#28 0x000055fea53f4397 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2f397)
#29 0x000055fea53f2b55 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2db55)
#30 0x000055fea5cb2029 clang::ParseAST(clang::Sema&, bool, bool) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x36ed029)
#31 0x000055fea53f1a18 clang::CodeGenAction::ExecuteAction() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2e2ca18)
#32 0x000055fea4dfef09 clang::FrontendAction::Execute() (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x2839f09)
#33 0x000055fea4dbb052 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x27f6052)
#34 0x000055fea4eae4b1 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0x28e94b1)
#35 0x000055fea33f9414 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe34414)
#36 0x000055fea33f5839 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe30839)
#37 0x000055fea33781f2 main (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xdb31f2)
#38 0x00007f5d5af5bb97 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b97)
#39 0x000055fea33f539a _start (/tmp/llvm-sycl-cuda/build/install/bin/clang-11+0xe3039a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.0.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /tmp/llvm-sycl-cuda/build/install/bin
clang-11: note: diagnostic msg: Error generating preprocessed source(s).
CMakeFiles/sobel.dir/build.make:62: recipe for target 'CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o' failed
make[2]: *** [CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o] Error 254
make[2]: Target 'CMakeFiles/sobel.dir/build' not remade because of errors.
CMakeFiles/Makefile2:849: recipe for target 'CMakeFiles/sobel.dir/all' failed
make[1]: *** [CMakeFiles/sobel.dir/all] Error 2
Scanning dependencies of target lin_reg_coeff
[ 60%] Building CXX object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
clang-11: note: diagnostic msg: Error generating preprocessed source(s).
CMakeFiles/sobel7.dir/build.make:62: recipe for target 'CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o' failed
make[2]: *** [CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o] Error 254
make[2]: Target 'CMakeFiles/sobel7.dir/build' not remade because of errors.
CMakeFiles/Makefile2:960: recipe for target 'CMakeFiles/sobel7.dir/all' failed
make[1]: *** [CMakeFiles/sobel7.dir/all] Error 2
Scanning dependencies of target mol_dyn
[ 61%] Building CXX object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 63%] Linking CXX executable scalar_prod
[ 63%] Built target scalar_prod
Scanning dependencies of target gesummv
[ 64%] Building CXX object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 65%] Linking CXX executable reduction
[ 65%] Built target reduction
Scanning dependencies of target 2mm
[ 67%] Building CXX object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
[ 68%] Linking CXX executable 2DConvolution
[ 68%] Built target 2DConvolution
Scanning dependencies of target sf
[ 69%] Building CXX object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 71%] Linking CXX executable correlation
[ 71%] Built target correlation
Scanning dependencies of target nbody
[ 72%] Building CXX object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 73%] Linking CXX executable lin_reg_error
[ 75%] Linking CXX executable dag_task_throughput_independent
[ 75%] Built target dag_task_throughput_independent
Scanning dependencies of target dag_task_throughput_sequential
[ 75%] Built target lin_reg_error
[ 76%] Building CXX object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 77%] Linking CXX executable local_mem
[ 78%] Linking CXX executable blocked_transform
[ 80%] Linking CXX executable arith
[ 80%] Built target local_mem
[ 80%] Built target blocked_transform
[ 80%] Built target arith
[ 81%] Linking CXX executable vec_add
[ 81%] Built target vec_add
[ 82%] Linking CXX executable mol_dyn
[ 82%] Built target mol_dyn
[ 84%] Linking CXX executable lin_reg_coeff
[ 84%] Built target lin_reg_coeff
[ 85%] Linking CXX executable DRAM
[ 85%] Built target DRAM
[ 86%] Linking CXX executable host_device_bandwidth
[ 86%] Built target host_device_bandwidth
[ 88%] Linking CXX executable pattern_L2
[ 89%] Linking CXX executable segmentedreduction
[ 90%] Linking CXX executable gesummv
[ 90%] Built target pattern_L2
[ 90%] Built target segmentedreduction
[ 92%] Linking CXX executable 2mm
[ 92%] Built target gesummv
[ 92%] Built target 2mm
[ 93%] Linking CXX executable sf
ptxas fatal   : Unresolved extern function '_Z15__spirv_ocl_tanf'
clang-11: error: ptxas command failed with exit code 255 (use -v to see invocation)
CMakeFiles/sf.dir/build.make:83: recipe for target 'sf' failed
make[2]: *** [sf] Error 255
make[2]: Target 'CMakeFiles/sf.dir/build' not remade because of errors.
CMakeFiles/Makefile2:1367: recipe for target 'CMakeFiles/sf.dir/all' failed
make[1]: *** [CMakeFiles/sf.dir/all] Error 2
[ 94%] Linking CXX executable dag_task_throughput_sequential
[ 94%] Built target dag_task_throughput_sequential
[ 96%] Linking CXX executable nbody
ptxas fatal   : Unresolved extern function '_Z17__spirv_ocl_rsqrtf'
clang-11: error: ptxas command failed with exit code 255 (use -v to see invocation)
CMakeFiles/nbody.dir/build.make:83: recipe for target 'nbody' failed
make[2]: *** [nbody] Error 255
make[2]: Target 'CMakeFiles/nbody.dir/build' not remade because of errors.
CMakeFiles/Makefile2:1404: recipe for target 'CMakeFiles/nbody.dir/all' failed
make[1]: *** [CMakeFiles/nbody.dir/all] Error 2
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Update the default symlink for the running suite to operate on.

In [61]:
! rm -r ./benchmarks
! ln -s ./dpc++-cuda-benchmarks ./benchmarks

Run the benchmarks on GPU:

In [ ]:
! rm -r ./sycl-bench.csv
! LD_LIBRARY_PATH=${LD_LIBARARY_PATH}:/tmp/llvm-sycl-cuda/build/install/lib ./run-suite gpu
Using test profile: gpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=gpu --size=1024 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/dpc++-cuda-kt-benchmarks/host_device_bandwidth with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=gpu', '--size=1024', '--local=256']
Benchmark failed, aborting run


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=gpu --size=1048576 --local=256
==> Benchmark FAILED: /workspace/codes/sycl-bench/bin/dpc++-cuda-kt-benchmarks/blocked_transform with args ['--num-runs=50', '--output=./sycl-bench.csv', '--device=gpu', '--size=1048576', '--local=256']
Benchmark failed, aborting run


##################################################
Processing correlation
##################################################
0.0 10800.0
__________________________________________________

correlation --num-runs=50 --output=./sycl-bench.csv --device=gpu --size=1024 --local=256
==> Benchmark run finished in 1.664168017989141 s


##################################################
Processing mvt
##################################################
0.0 10800.0
__________________________________________________

mvt --num-runs=50 --output=./sycl-bench.csv --device=gpu --size=16384 --local=256

Rename the results to something more descriptive.

In [1]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-p100-kt-dpc++-cuda.csv

hipSYCL

CPU

Compile with hipSYCL-CPU and remove non-applications from the final build.

In [5]:
! rm -r ./hipsycl-cpu-kt-benchmarks
! mkdir ./hipsycl-cpu-kt-benchmarks && cd ./hipsycl-cpu-kt-benchmarks && cmake ../.. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=cpu && make -j16 --keep-going && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gramschmidt
Scanning dependencies of target gemm
Scanning dependencies of target covariance
Scanning dependencies of target fdtd2d
Scanning dependencies of target syr2k
Scanning dependencies of target 3mm
Scanning dependencies of target 3DConvolution
Scanning dependencies of target matmulchain
Scanning dependencies of target bicg
Scanning dependencies of target median
Scanning dependencies of target atax
Scanning dependencies of target kmeans
Scanning dependencies of target sobel5
Scanning dependencies of target scalar_prod
[  1%] Building SYCL object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[  2%] Building SYCL object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[  3%] Building SYCL object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[  5%] Building SYCL object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  9%] Building SYCL object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  9%] Building SYCL object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  9%] Building SYCL object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 10%] Building SYCL object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 11%] Building SYCL object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 15%] Building SYCL object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 15%] Building SYCL object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 15%] Building SYCL object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 17%] Building SYCL object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 18%] Building SYCL object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[ 19%] Building SYCL object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
[ 21%] Building SYCL object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                                               ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                                               ~~~^
4 errors generated.
CMakeFiles/median.dir/build.make:62: recipe for target 'CMakeFiles/median.dir/single-kernel/median.cpp.o' failed
make[2]: *** [CMakeFiles/median.dir/single-kernel/median.cpp.o] Error 1
make[2]: Target 'CMakeFiles/median.dir/build' not remade because of errors.
CMakeFiles/Makefile2:479: recipe for target 'CMakeFiles/median.dir/all' failed
make[1]: *** [CMakeFiles/median.dir/all] Error 2
Scanning dependencies of target reduction
[ 22%] Building SYCL object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 23%] Linking SYCL executable syrk
[ 25%] Linking SYCL executable gemm
[ 25%] Built target syrk
Scanning dependencies of target DRAM
[ 26%] Building SYCL object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 27%] Linking SYCL executable matmulchain
[ 28%] Linking SYCL executable syr2k
[ 30%] Linking SYCL executable 3DConvolution
[ 30%] Built target gemm
Scanning dependencies of target arith
[ 31%] Building SYCL object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 31%] Built target matmulchain
[ 31%] Built target syr2k
Scanning dependencies of target local_mem
[ 32%] Building SYCL object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
Scanning dependencies of target correlation
[ 34%] Building SYCL object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 34%] Built target 3DConvolution
Scanning dependencies of target sobel
[ 35%] Building SYCL object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 36%] Linking SYCL executable mvt
[ 38%] Linking SYCL executable sobel5
[ 39%] Linking SYCL executable gramschmidt
[ 40%] Linking SYCL executable bicg
[ 42%] Linking SYCL executable 3mm
[ 42%] Built target mvt
[ 43%] Linking SYCL executable atax
Scanning dependencies of target pattern_L2
[ 44%] Building SYCL object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
[ 44%] Built target sobel5
Scanning dependencies of target host_device_bandwidth
[ 46%] Building SYCL object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 46%] Built target bicg
[ 46%] Built target gramschmidt
[ 46%] Built target 3mm
Scanning dependencies of target sobel7
Scanning dependencies of target 2DConvolution
Scanning dependencies of target vec_add
[ 48%] Building SYCL object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 48%] Building SYCL object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
[ 50%] Building SYCL object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 50%] Built target atax
Scanning dependencies of target dag_task_throughput_independent
[ 51%] Building SYCL object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 52%] Linking SYCL executable covariance
[ 53%] Linking SYCL executable fdtd2d
[ 53%] Built target covariance
Scanning dependencies of target blocked_transform
[ 55%] Building SYCL object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 55%] Built target fdtd2d
Scanning dependencies of target lin_reg_error
[ 56%] Building SYCL object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 57%] Linking SYCL executable kmeans
[ 57%] Built target kmeans
Scanning dependencies of target segmentedreduction
[ 59%] Building SYCL object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
              [=](cl::sycl::group<1> grp){
              ^
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 60%] Linking SYCL executable 2DConvolution
[ 60%] Built target 2DConvolution
Scanning dependencies of target lin_reg_coeff
[ 61%] Building SYCL object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 63%] Linking SYCL executable sobel
[ 64%] Linking SYCL executable sobel7
[ 64%] Built target sobel
Scanning dependencies of target mol_dyn
[ 65%] Building SYCL object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 65%] Built target sobel7
Scanning dependencies of target gesummv
[ 67%] Building SYCL object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 68%] Linking SYCL executable correlation
[ 68%] Built target correlation
Scanning dependencies of target 2mm
[ 69%] Building SYCL object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
[ 71%] Linking SYCL executable lin_reg_error
[ 72%] Linking SYCL executable dag_task_throughput_independent
[ 73%] Linking SYCL executable blocked_transform
[ 73%] Built target lin_reg_error
Scanning dependencies of target sf
[ 75%] Building SYCL object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 75%] Built target dag_task_throughput_independent
Scanning dependencies of target nbody
[ 76%] Building SYCL object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 76%] Built target blocked_transform
Scanning dependencies of target dag_task_throughput_sequential
[ 77%] Building SYCL object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 78%] Linking SYCL executable arith
[ 78%] Built target arith
[ 80%] Linking SYCL executable local_mem
12 warnings generated.
[ 81%] Linking SYCL executable reduction
[ 82%] Linking SYCL executable vec_add
[ 82%] Built target local_mem
[ 82%] Built target reduction
[ 82%] Built target vec_add
12 warnings generated.
[ 84%] Linking SYCL executable scalar_prod
[ 84%] Built target scalar_prod
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 85%] Linking SYCL executable mol_dyn
[ 85%] Built target mol_dyn
[ 86%] Linking SYCL executable DRAM
[ 88%] Linking SYCL executable gesummv
[ 88%] Built target DRAM
[ 88%] Built target gesummv
[ 89%] Linking SYCL executable 2mm
[ 90%] Linking SYCL executable lin_reg_coeff
[ 90%] Built target 2mm
[ 90%] Built target lin_reg_coeff
[ 92%] Linking SYCL executable host_device_bandwidth
[ 92%] Built target host_device_bandwidth
[ 93%] Linking SYCL executable sf
[ 93%] Built target sf
[ 94%] Linking SYCL executable dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
In file included from /workspace/codes/sycl-bench/single-kernel/nbody.cpp:1:
In file included from /workspace/codes/sycl-bench/include/common.h:2:
In file included from /opt/hipSYCL/bin/../include/CL/sycl.hpp:31:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/sycl.hpp:43:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/queue.hpp:41:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/handler.hpp:44:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/nd_item.hpp:36:
/opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/group.hpp:430:8: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
  void iterate_over_work_items(const range<1> iteration_range,
       ^
[ 94%] Built target dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
15 warnings generated.
[ 96%] Linking SYCL executable segmentedreduction
[ 96%] Built target segmentedreduction
[ 97%] Linking SYCL executable pattern_L2
[ 97%] Built target pattern_L2
3 warnings generated.
[ 98%] Linking SYCL executable nbody
[ 98%] Built target nbody
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Update the default symlink for the running suite to operate on.

In [6]:
! rm -r ./benchmarks
! ln -s ./hipsycl-cpu-kt-benchmarks ./benchmarks

Run the benchmarks on CPU:

In [7]:
! ./run-suite cpu
Using test profile: cpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: 3D copy() is currently not supported on this platform
SYCL error: 3D copy() is currently not supported on this platform
==> Benchmark run finished in 473.6365746310039 s


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.05814825801644474 s


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 2980.474010874983 s


##################################################
Processing Makefile
##################################################
0.0 10800.0
__________________________________________________

Makefile --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
Traceback (most recent call last):
  File "./run-suite", line 249, in <module>
    retcode, elapsed_time = invoke_benchmark(benchmark_executable, args)
  File "./run-suite", line 147, in invoke_benchmark
    retcode = subprocess.call([benchmark_executable]+args)
  File "/usr/lib/python3.6/subprocess.py", line 287, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/usr/lib/python3.6/subprocess.py", line 729, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1364, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
PermissionError: [Errno 13] Permission denied: '/workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks/Makefile'

Rename the results to something more descriptive.

In [42]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-kt-hipsycl-cpu.csv
ROCm

Compile with hipSYCL-rocm and remove non-applications from the final build.

In [5]:
! rm -r ./hipsycl-rocm-benchmarks
! mkdir ./hipsycl-rocm-benchmarks && cd ./hipsycl-rocm-benchmarks && cmake .. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=rocm -DHIPSYCL_GPU_ARCH=gfx906 && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gramschmidt
Scanning dependencies of target gemm
Scanning dependencies of target covariance
Scanning dependencies of target fdtd2d
Scanning dependencies of target syr2k
Scanning dependencies of target 3mm
Scanning dependencies of target 3DConvolution
Scanning dependencies of target matmulchain
Scanning dependencies of target bicg
Scanning dependencies of target median
Scanning dependencies of target atax
Scanning dependencies of target kmeans
Scanning dependencies of target sobel5
Scanning dependencies of target scalar_prod
[  1%] Building SYCL object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[  2%] Building SYCL object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[  3%] Building SYCL object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[  5%] Building SYCL object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  9%] Building SYCL object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  9%] Building SYCL object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  9%] Building SYCL object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 10%] Building SYCL object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 11%] Building SYCL object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 15%] Building SYCL object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 15%] Building SYCL object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 15%] Building SYCL object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 17%] Building SYCL object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 18%] Building SYCL object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[ 19%] Building SYCL object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
[ 21%] Building SYCL object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                                               ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                                               ~~~^
4 errors generated.
CMakeFiles/median.dir/build.make:62: recipe for target 'CMakeFiles/median.dir/single-kernel/median.cpp.o' failed
make[2]: *** [CMakeFiles/median.dir/single-kernel/median.cpp.o] Error 1
make[2]: Target 'CMakeFiles/median.dir/build' not remade because of errors.
CMakeFiles/Makefile2:479: recipe for target 'CMakeFiles/median.dir/all' failed
make[1]: *** [CMakeFiles/median.dir/all] Error 2
Scanning dependencies of target reduction
[ 22%] Building SYCL object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 23%] Linking SYCL executable syrk
[ 25%] Linking SYCL executable gemm
[ 25%] Built target syrk
Scanning dependencies of target DRAM
[ 26%] Building SYCL object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 27%] Linking SYCL executable matmulchain
[ 28%] Linking SYCL executable syr2k
[ 30%] Linking SYCL executable 3DConvolution
[ 30%] Built target gemm
Scanning dependencies of target arith
[ 31%] Building SYCL object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 31%] Built target matmulchain
[ 31%] Built target syr2k
Scanning dependencies of target local_mem
[ 32%] Building SYCL object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
Scanning dependencies of target correlation
[ 34%] Building SYCL object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 34%] Built target 3DConvolution
Scanning dependencies of target sobel
[ 35%] Building SYCL object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 36%] Linking SYCL executable mvt
[ 38%] Linking SYCL executable sobel5
[ 39%] Linking SYCL executable gramschmidt
[ 40%] Linking SYCL executable bicg
[ 42%] Linking SYCL executable 3mm
[ 42%] Built target mvt
[ 43%] Linking SYCL executable atax
Scanning dependencies of target pattern_L2
[ 44%] Building SYCL object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
[ 44%] Built target sobel5
Scanning dependencies of target host_device_bandwidth
[ 46%] Building SYCL object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 46%] Built target bicg
[ 46%] Built target gramschmidt
[ 46%] Built target 3mm
Scanning dependencies of target sobel7
Scanning dependencies of target 2DConvolution
Scanning dependencies of target vec_add
[ 48%] Building SYCL object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 48%] Building SYCL object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
[ 50%] Building SYCL object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 50%] Built target atax
Scanning dependencies of target dag_task_throughput_independent
[ 51%] Building SYCL object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 52%] Linking SYCL executable covariance
[ 53%] Linking SYCL executable fdtd2d
[ 53%] Built target covariance
Scanning dependencies of target blocked_transform
[ 55%] Building SYCL object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 55%] Built target fdtd2d
Scanning dependencies of target lin_reg_error
[ 56%] Building SYCL object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 57%] Linking SYCL executable kmeans
[ 57%] Built target kmeans
Scanning dependencies of target segmentedreduction
[ 59%] Building SYCL object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
              [=](cl::sycl::group<1> grp){
              ^
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 60%] Linking SYCL executable 2DConvolution
[ 60%] Built target 2DConvolution
Scanning dependencies of target lin_reg_coeff
[ 61%] Building SYCL object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 63%] Linking SYCL executable sobel
[ 64%] Linking SYCL executable sobel7
[ 64%] Built target sobel
Scanning dependencies of target mol_dyn
[ 65%] Building SYCL object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 65%] Built target sobel7
Scanning dependencies of target gesummv
[ 67%] Building SYCL object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 68%] Linking SYCL executable correlation
[ 68%] Built target correlation
Scanning dependencies of target 2mm
[ 69%] Building SYCL object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
[ 71%] Linking SYCL executable lin_reg_error
[ 72%] Linking SYCL executable dag_task_throughput_independent
[ 73%] Linking SYCL executable blocked_transform
[ 73%] Built target lin_reg_error
Scanning dependencies of target sf
[ 75%] Building SYCL object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 75%] Built target dag_task_throughput_independent
Scanning dependencies of target nbody
[ 76%] Building SYCL object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 76%] Built target blocked_transform
Scanning dependencies of target dag_task_throughput_sequential
[ 77%] Building SYCL object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 78%] Linking SYCL executable arith
[ 78%] Built target arith
[ 80%] Linking SYCL executable local_mem
12 warnings generated.
[ 81%] Linking SYCL executable reduction
[ 82%] Linking SYCL executable vec_add
[ 82%] Built target local_mem
[ 82%] Built target reduction
[ 82%] Built target vec_add
12 warnings generated.
[ 84%] Linking SYCL executable scalar_prod
[ 84%] Built target scalar_prod
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 85%] Linking SYCL executable mol_dyn
[ 85%] Built target mol_dyn
[ 86%] Linking SYCL executable DRAM
[ 88%] Linking SYCL executable gesummv
[ 88%] Built target DRAM
[ 88%] Built target gesummv
[ 89%] Linking SYCL executable 2mm
[ 90%] Linking SYCL executable lin_reg_coeff
[ 90%] Built target 2mm
[ 90%] Built target lin_reg_coeff
[ 92%] Linking SYCL executable host_device_bandwidth
[ 92%] Built target host_device_bandwidth
[ 93%] Linking SYCL executable sf
[ 93%] Built target sf
[ 94%] Linking SYCL executable dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
In file included from /workspace/codes/sycl-bench/single-kernel/nbody.cpp:1:
In file included from /workspace/codes/sycl-bench/include/common.h:2:
In file included from /opt/hipSYCL/bin/../include/CL/sycl.hpp:31:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/sycl.hpp:43:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/queue.hpp:41:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/handler.hpp:44:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/nd_item.hpp:36:
/opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/group.hpp:430:8: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
  void iterate_over_work_items(const range<1> iteration_range,
       ^
[ 94%] Built target dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
15 warnings generated.
[ 96%] Linking SYCL executable segmentedreduction
[ 96%] Built target segmentedreduction
[ 97%] Linking SYCL executable pattern_L2
[ 97%] Built target pattern_L2
3 warnings generated.
[ 98%] Linking SYCL executable nbody
[ 98%] Built target nbody
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Update the default symlink for the running suite to operate on.

In [6]:
! rm -r ./benchmarks
! ln -s ./hipsycl-rocm-benchmarks ./benchmarks

Run the benchmarks on CPU:

In [7]:
! ./run-suite gpu
Using test profile: cpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: 3D copy() is currently not supported on this platform
SYCL error: 3D copy() is currently not supported on this platform
==> Benchmark run finished in 473.6365746310039 s


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.05814825801644474 s


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 2980.474010874983 s


##################################################
Processing Makefile
##################################################
0.0 10800.0
__________________________________________________

Makefile --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
Traceback (most recent call last):
  File "./run-suite", line 249, in <module>
    retcode, elapsed_time = invoke_benchmark(benchmark_executable, args)
  File "./run-suite", line 147, in invoke_benchmark
    retcode = subprocess.call([benchmark_executable]+args)
  File "/usr/lib/python3.6/subprocess.py", line 287, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/usr/lib/python3.6/subprocess.py", line 729, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1364, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
PermissionError: [Errno 13] Permission denied: '/workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks/Makefile'

Rename the results to something more descriptive.

In [42]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gfx906-hipsycl-rocm.csv
ROCm

perf is installed with:

apt install linux-tools-5.4.0-42-generic linux-tools-generic

proxy -- /etc/apt/apt.conf:

Acquire::http::Proxy "http://proxy.ftpn.ornl.gov:3128";
Acquire::https::Proxy "http://proxy.ftpn.ornl.gov:3128";

The environment may need to be reinstalled within a running docker instance, this is achieved with the following:

apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl gnupg && \
  curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add - && \
  sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list' && \
  apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
  sudo \
  libelf1 \
  libnuma-dev \
  build-essential \
  git \
  vim-nox \
  cmake-curses-gui \
  kmod \
  file \
  rocm-dev

Compile with hipSYCL-rocm and remove non-applications from the final build.

In [5]:
! rm -r ./hipsycl-rocm-benchmarks
! mkdir ./hipsycl-rocm-benchmarks && cd ./hipsycl-rocm-benchmarks && cmake .. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=rocm -DHIPSYCL_GPU_ARCH=gfx906 && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gramschmidt
Scanning dependencies of target gemm
Scanning dependencies of target covariance
Scanning dependencies of target fdtd2d
Scanning dependencies of target syr2k
Scanning dependencies of target 3mm
Scanning dependencies of target 3DConvolution
Scanning dependencies of target matmulchain
Scanning dependencies of target bicg
Scanning dependencies of target median
Scanning dependencies of target atax
Scanning dependencies of target kmeans
Scanning dependencies of target sobel5
Scanning dependencies of target scalar_prod
[  1%] Building SYCL object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[  2%] Building SYCL object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[  3%] Building SYCL object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[  5%] Building SYCL object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  9%] Building SYCL object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  9%] Building SYCL object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  9%] Building SYCL object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 10%] Building SYCL object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 11%] Building SYCL object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 15%] Building SYCL object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 15%] Building SYCL object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 15%] Building SYCL object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 17%] Building SYCL object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 18%] Building SYCL object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[ 19%] Building SYCL object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
[ 21%] Building SYCL object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                                               ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                                               ~~~^
4 errors generated.
CMakeFiles/median.dir/build.make:62: recipe for target 'CMakeFiles/median.dir/single-kernel/median.cpp.o' failed
make[2]: *** [CMakeFiles/median.dir/single-kernel/median.cpp.o] Error 1
make[2]: Target 'CMakeFiles/median.dir/build' not remade because of errors.
CMakeFiles/Makefile2:479: recipe for target 'CMakeFiles/median.dir/all' failed
make[1]: *** [CMakeFiles/median.dir/all] Error 2
Scanning dependencies of target reduction
[ 22%] Building SYCL object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 23%] Linking SYCL executable syrk
[ 25%] Linking SYCL executable gemm
[ 25%] Built target syrk
Scanning dependencies of target DRAM
[ 26%] Building SYCL object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 27%] Linking SYCL executable matmulchain
[ 28%] Linking SYCL executable syr2k
[ 30%] Linking SYCL executable 3DConvolution
[ 30%] Built target gemm
Scanning dependencies of target arith
[ 31%] Building SYCL object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 31%] Built target matmulchain
[ 31%] Built target syr2k
Scanning dependencies of target local_mem
[ 32%] Building SYCL object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
Scanning dependencies of target correlation
[ 34%] Building SYCL object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 34%] Built target 3DConvolution
Scanning dependencies of target sobel
[ 35%] Building SYCL object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 36%] Linking SYCL executable mvt
[ 38%] Linking SYCL executable sobel5
[ 39%] Linking SYCL executable gramschmidt
[ 40%] Linking SYCL executable bicg
[ 42%] Linking SYCL executable 3mm
[ 42%] Built target mvt
[ 43%] Linking SYCL executable atax
Scanning dependencies of target pattern_L2
[ 44%] Building SYCL object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
[ 44%] Built target sobel5
Scanning dependencies of target host_device_bandwidth
[ 46%] Building SYCL object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 46%] Built target bicg
[ 46%] Built target gramschmidt
[ 46%] Built target 3mm
Scanning dependencies of target sobel7
Scanning dependencies of target 2DConvolution
Scanning dependencies of target vec_add
[ 48%] Building SYCL object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 48%] Building SYCL object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
[ 50%] Building SYCL object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 50%] Built target atax
Scanning dependencies of target dag_task_throughput_independent
[ 51%] Building SYCL object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 52%] Linking SYCL executable covariance
[ 53%] Linking SYCL executable fdtd2d
[ 53%] Built target covariance
Scanning dependencies of target blocked_transform
[ 55%] Building SYCL object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 55%] Built target fdtd2d
Scanning dependencies of target lin_reg_error
[ 56%] Building SYCL object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 57%] Linking SYCL executable kmeans
[ 57%] Built target kmeans
Scanning dependencies of target segmentedreduction
[ 59%] Building SYCL object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
              [=](cl::sycl::group<1> grp){
              ^
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 60%] Linking SYCL executable 2DConvolution
[ 60%] Built target 2DConvolution
Scanning dependencies of target lin_reg_coeff
[ 61%] Building SYCL object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 63%] Linking SYCL executable sobel
[ 64%] Linking SYCL executable sobel7
[ 64%] Built target sobel
Scanning dependencies of target mol_dyn
[ 65%] Building SYCL object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 65%] Built target sobel7
Scanning dependencies of target gesummv
[ 67%] Building SYCL object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 68%] Linking SYCL executable correlation
[ 68%] Built target correlation
Scanning dependencies of target 2mm
[ 69%] Building SYCL object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
[ 71%] Linking SYCL executable lin_reg_error
[ 72%] Linking SYCL executable dag_task_throughput_independent
[ 73%] Linking SYCL executable blocked_transform
[ 73%] Built target lin_reg_error
Scanning dependencies of target sf
[ 75%] Building SYCL object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 75%] Built target dag_task_throughput_independent
Scanning dependencies of target nbody
[ 76%] Building SYCL object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 76%] Built target blocked_transform
Scanning dependencies of target dag_task_throughput_sequential
[ 77%] Building SYCL object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 78%] Linking SYCL executable arith
[ 78%] Built target arith
[ 80%] Linking SYCL executable local_mem
12 warnings generated.
[ 81%] Linking SYCL executable reduction
[ 82%] Linking SYCL executable vec_add
[ 82%] Built target local_mem
[ 82%] Built target reduction
[ 82%] Built target vec_add
12 warnings generated.
[ 84%] Linking SYCL executable scalar_prod
[ 84%] Built target scalar_prod
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 85%] Linking SYCL executable mol_dyn
[ 85%] Built target mol_dyn
[ 86%] Linking SYCL executable DRAM
[ 88%] Linking SYCL executable gesummv
[ 88%] Built target DRAM
[ 88%] Built target gesummv
[ 89%] Linking SYCL executable 2mm
[ 90%] Linking SYCL executable lin_reg_coeff
[ 90%] Built target 2mm
[ 90%] Built target lin_reg_coeff
[ 92%] Linking SYCL executable host_device_bandwidth
[ 92%] Built target host_device_bandwidth
[ 93%] Linking SYCL executable sf
[ 93%] Built target sf
[ 94%] Linking SYCL executable dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
In file included from /workspace/codes/sycl-bench/single-kernel/nbody.cpp:1:
In file included from /workspace/codes/sycl-bench/include/common.h:2:
In file included from /opt/hipSYCL/bin/../include/CL/sycl.hpp:31:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/sycl.hpp:43:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/queue.hpp:41:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/handler.hpp:44:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/nd_item.hpp:36:
/opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/group.hpp:430:8: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
  void iterate_over_work_items(const range<1> iteration_range,
       ^
[ 94%] Built target dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
15 warnings generated.
[ 96%] Linking SYCL executable segmentedreduction
[ 96%] Built target segmentedreduction
[ 97%] Linking SYCL executable pattern_L2
[ 97%] Built target pattern_L2
3 warnings generated.
[ 98%] Linking SYCL executable nbody
[ 98%] Built target nbody
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Update the default symlink for the running suite to operate on.

In [6]:
! rm -r ./benchmarks
! ln -s ./hipsycl-rocm-benchmarks ./benchmarks

Run the benchmarks on CPU:

In [7]:
! ./run-suite gpu
Using test profile: cpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: 3D copy() is currently not supported on this platform
SYCL error: 3D copy() is currently not supported on this platform
==> Benchmark run finished in 473.6365746310039 s


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.05814825801644474 s


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 2980.474010874983 s


##################################################
Processing Makefile
##################################################
0.0 10800.0
__________________________________________________

Makefile --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
Traceback (most recent call last):
  File "./run-suite", line 249, in <module>
    retcode, elapsed_time = invoke_benchmark(benchmark_executable, args)
  File "./run-suite", line 147, in invoke_benchmark
    retcode = subprocess.call([benchmark_executable]+args)
  File "/usr/lib/python3.6/subprocess.py", line 287, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/usr/lib/python3.6/subprocess.py", line 729, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1364, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
PermissionError: [Errno 13] Permission denied: '/workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks/Makefile'

Rename the results to something more descriptive.

In [42]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gfx906-hipsycl-rocm.csv
CUDA

Compile with hipSYCL-cuda and remove non-applications from the final build.

In [5]:
! rm -r ./hipsycl-cuda-benchmarks
! mkdir ./hipsycl-cuda-benchmarks && cd ./hipsycl-cuda-benchmarks && cmake .. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=cuda -DHIPSYCL_GPU_ARCH=sm_60 && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gramschmidt
Scanning dependencies of target gemm
Scanning dependencies of target covariance
Scanning dependencies of target fdtd2d
Scanning dependencies of target syr2k
Scanning dependencies of target 3mm
Scanning dependencies of target 3DConvolution
Scanning dependencies of target matmulchain
Scanning dependencies of target bicg
Scanning dependencies of target median
Scanning dependencies of target atax
Scanning dependencies of target kmeans
Scanning dependencies of target sobel5
Scanning dependencies of target scalar_prod
[  1%] Building SYCL object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[  2%] Building SYCL object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[  3%] Building SYCL object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[  5%] Building SYCL object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  9%] Building SYCL object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  9%] Building SYCL object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  9%] Building SYCL object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 10%] Building SYCL object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 11%] Building SYCL object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 15%] Building SYCL object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
[ 15%] Building SYCL object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 15%] Building SYCL object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 17%] Building SYCL object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 18%] Building SYCL object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
[ 19%] Building SYCL object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
[ 21%] Building SYCL object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:70:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint xs = s::min(s::max(static_cast<s::cl_int>(x+j), static_cast<s::cl_int>(0)), static_cast<int>(size_-1)); // borders are handled here with extended values
                                                                               ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:54: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                  ~~~^
/workspace/codes/sycl-bench/single-kernel/median.cpp:71:83: error: no type named 'cl_int' in namespace 'cl::sycl'
              uint ys = s::min(s::max(static_cast<s::cl_int>(y+i), static_cast<s::cl_int>(0)), static_cast<int>(size_-1));
                                                                               ~~~^
4 errors generated.
CMakeFiles/median.dir/build.make:62: recipe for target 'CMakeFiles/median.dir/single-kernel/median.cpp.o' failed
make[2]: *** [CMakeFiles/median.dir/single-kernel/median.cpp.o] Error 1
make[2]: Target 'CMakeFiles/median.dir/build' not remade because of errors.
CMakeFiles/Makefile2:479: recipe for target 'CMakeFiles/median.dir/all' failed
make[1]: *** [CMakeFiles/median.dir/all] Error 2
Scanning dependencies of target reduction
[ 22%] Building SYCL object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 23%] Linking SYCL executable syrk
[ 25%] Linking SYCL executable gemm
[ 25%] Built target syrk
Scanning dependencies of target DRAM
[ 26%] Building SYCL object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 27%] Linking SYCL executable matmulchain
[ 28%] Linking SYCL executable syr2k
[ 30%] Linking SYCL executable 3DConvolution
[ 30%] Built target gemm
Scanning dependencies of target arith
[ 31%] Building SYCL object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 31%] Built target matmulchain
[ 31%] Built target syr2k
Scanning dependencies of target local_mem
[ 32%] Building SYCL object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
Scanning dependencies of target correlation
[ 34%] Building SYCL object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 34%] Built target 3DConvolution
Scanning dependencies of target sobel
[ 35%] Building SYCL object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 36%] Linking SYCL executable mvt
[ 38%] Linking SYCL executable sobel5
[ 39%] Linking SYCL executable gramschmidt
[ 40%] Linking SYCL executable bicg
[ 42%] Linking SYCL executable 3mm
[ 42%] Built target mvt
[ 43%] Linking SYCL executable atax
Scanning dependencies of target pattern_L2
[ 44%] Building SYCL object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
[ 44%] Built target sobel5
Scanning dependencies of target host_device_bandwidth
[ 46%] Building SYCL object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 46%] Built target bicg
[ 46%] Built target gramschmidt
[ 46%] Built target 3mm
Scanning dependencies of target sobel7
Scanning dependencies of target 2DConvolution
Scanning dependencies of target vec_add
[ 48%] Building SYCL object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 48%] Building SYCL object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
[ 50%] Building SYCL object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
[ 50%] Built target atax
Scanning dependencies of target dag_task_throughput_independent
[ 51%] Building SYCL object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
[ 52%] Linking SYCL executable covariance
[ 53%] Linking SYCL executable fdtd2d
[ 53%] Built target covariance
Scanning dependencies of target blocked_transform
[ 55%] Building SYCL object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
[ 55%] Built target fdtd2d
Scanning dependencies of target lin_reg_error
[ 56%] Building SYCL object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 57%] Linking SYCL executable kmeans
[ 57%] Built target kmeans
Scanning dependencies of target segmentedreduction
[ 59%] Building SYCL object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
              [=](cl::sycl::group<1> grp){
              ^
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/reduction.cpp:160:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/single-kernel/scalar_prod.cpp:138:15: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 60%] Linking SYCL executable 2DConvolution
[ 60%] Built target 2DConvolution
Scanning dependencies of target lin_reg_coeff
[ 61%] Building SYCL object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 63%] Linking SYCL executable sobel
[ 64%] Linking SYCL executable sobel7
[ 64%] Built target sobel
Scanning dependencies of target mol_dyn
[ 65%] Building SYCL object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
[ 65%] Built target sobel7
Scanning dependencies of target gesummv
[ 67%] Building SYCL object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 68%] Linking SYCL executable correlation
[ 68%] Built target correlation
Scanning dependencies of target 2mm
[ 69%] Building SYCL object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
[ 71%] Linking SYCL executable lin_reg_error
[ 72%] Linking SYCL executable dag_task_throughput_independent
[ 73%] Linking SYCL executable blocked_transform
[ 73%] Built target lin_reg_error
Scanning dependencies of target sf
[ 75%] Building SYCL object CMakeFiles/sf.dir/micro/sf.cpp.o
[ 75%] Built target dag_task_throughput_independent
Scanning dependencies of target nbody
[ 76%] Building SYCL object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
[ 76%] Built target blocked_transform
Scanning dependencies of target dag_task_throughput_sequential
[ 77%] Building SYCL object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 78%] Linking SYCL executable arith
[ 78%] Built target arith
[ 80%] Linking SYCL executable local_mem
12 warnings generated.
[ 81%] Linking SYCL executable reduction
[ 82%] Linking SYCL executable vec_add
[ 82%] Built target local_mem
[ 82%] Built target reduction
[ 82%] Built target vec_add
12 warnings generated.
[ 84%] Linking SYCL executable scalar_prod
[ 84%] Built target scalar_prod
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
        [=](sycl::group<1> grp) {
        ^
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
/workspace/codes/sycl-bench/pattern/segmentedreduction.cpp:93:9: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
[ 85%] Linking SYCL executable mol_dyn
[ 85%] Built target mol_dyn
[ 86%] Linking SYCL executable DRAM
[ 88%] Linking SYCL executable gesummv
[ 88%] Built target DRAM
[ 88%] Built target gesummv
[ 89%] Linking SYCL executable 2mm
[ 90%] Linking SYCL executable lin_reg_coeff
[ 90%] Built target 2mm
[ 90%] Built target lin_reg_coeff
[ 92%] Linking SYCL executable host_device_bandwidth
[ 92%] Built target host_device_bandwidth
[ 93%] Linking SYCL executable sf
[ 93%] Built target sf
[ 94%] Linking SYCL executable dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
In file included from /workspace/codes/sycl-bench/single-kernel/nbody.cpp:1:
In file included from /workspace/codes/sycl-bench/include/common.h:2:
In file included from /opt/hipSYCL/bin/../include/CL/sycl.hpp:31:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/sycl.hpp:43:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/queue.hpp:41:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/handler.hpp:44:
In file included from /opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/nd_item.hpp:36:
/opt/hipSYCL/bin/../include/CL/../hipSYCL/sycl/group.hpp:430:8: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
  void iterate_over_work_items(const range<1> iteration_range,
       ^
[ 94%] Built target dag_task_throughput_sequential
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:231:11: warning: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]
          [=, dt = this->dt, gravitational_softening = this->gravitational_softening](sycl::group<1> grp) {
          ^
15 warnings generated.
[ 96%] Linking SYCL executable segmentedreduction
[ 96%] Built target segmentedreduction
[ 97%] Linking SYCL executable pattern_L2
[ 97%] Built target pattern_L2
3 warnings generated.
[ 98%] Linking SYCL executable nbody
[ 98%] Built target nbody
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Update the default symlink for the running suite to operate on.

In [6]:
! rm -r ./benchmarks
! ln -s ./hipsycl-cua-benchmarks ./benchmarks

Run the benchmarks on GPU:

In [7]:
! ./run-suite gpu
Using test profile: cpu


##################################################
Processing host_device_bandwidth
##################################################
0.0 10800.0
__________________________________________________

host_device_bandwidth --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
SYCL error: 3D copy() is currently not supported on this platform
SYCL error: 3D copy() is currently not supported on this platform
==> Benchmark run finished in 473.6365746310039 s


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
==> Benchmark run finished in 0.05814825801644474 s


##################################################
Processing blocked_transform
##################################################
0.0 10800.0
__________________________________________________

blocked_transform --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1048576 --local=256
==> Benchmark run finished in 2980.474010874983 s


##################################################
Processing Makefile
##################################################
0.0 10800.0
__________________________________________________

Makefile --num-runs=50 --output=./sycl-bench.csv --device=cpu --size=1024 --local=256
Traceback (most recent call last):
  File "./run-suite", line 249, in <module>
    retcode, elapsed_time = invoke_benchmark(benchmark_executable, args)
  File "./run-suite", line 147, in invoke_benchmark
    retcode = subprocess.call([benchmark_executable]+args)
  File "/usr/lib/python3.6/subprocess.py", line 287, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/usr/lib/python3.6/subprocess.py", line 729, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1364, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
PermissionError: [Errno 13] Permission denied: '/workspace/codes/sycl-bench/bin/hipsycl-cpu-kt-benchmarks/Makefile'

Rename the results to something more descriptive.

In [42]:
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-p100-hipsycl-cuda.csv

triSYCL

Note: Vendor OpenCL ICD's need to be hidden from view for triSYCL to default to the OpenMP/TBB implementation.

CPU

Compile with triSYCL-CPU and remove non-applications from the final build.

In [13]:
! rm -r ./trisycl-cpu-benchmarks
! mkdir ./trisycl-cpu-benchmarks && cd ./trisycl-cpu-benchmarks && cmake ../.. -DSYCL_IMPL=triSYCL -DTRISYCL_TBB=ON -DTRISYCL_INCLUDE_DIR=/tmp/triSYCL-master/include && make -j16 --keep-going
! cd ./trisycl-cpu-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- host compiler - clang 9.0.1
-- Found triSYCL include directory: /tmp/triSYCL-master/include
-- Found OpenMP_C: -fopenmp=libomp (found version "3.1") 
-- Found OpenMP_CXX: -fopenmp=libomp (found version "3.1") 
-- Found OpenMP: TRUE (found version "3.1")  
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Looking for pthread_create
-- Looking for pthread_create - not found
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE  
-- Boost version: 1.65.1
-- Found the following Boost libraries:
--   chrono
--   log
--   system
--   date_time
--   log_setup
--   filesystem
--   thread
--   regex
--   atomic
-- triSYCL OpenMP:                   ON
-- triSYCL TBB:                      ON
-- triSYCL OpenCL:                   OFF
-- triSYCL synchronous execution:    OFF
-- triSYCL debug mode:               OFF
-- triSYCL object trace:             OFF
-- triSYCL kernel trace:             OFF
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/trisycl-cpu-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gemm
Scanning dependencies of target gramschmidt
Scanning dependencies of target fdtd2d
Scanning dependencies of target covariance
Scanning dependencies of target syr2k
Scanning dependencies of target bicg
Scanning dependencies of target 3DConvolution
Scanning dependencies of target median
Scanning dependencies of target matmulchain
Scanning dependencies of target 3mm
Scanning dependencies of target correlation
Scanning dependencies of target local_mem
Scanning dependencies of target arith
Scanning dependencies of target sobel
[  1%] Building CXX object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[  2%] Building CXX object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  3%] Building CXX object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  5%] Building CXX object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  6%] Building CXX object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[  9%] Building CXX object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
[  9%] Building CXX object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 10%] Building CXX object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 11%] Building CXX object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 13%] Building CXX object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 14%] Building CXX object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 15%] Building CXX object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[ 17%] Building CXX object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 18%] Building CXX object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 19%] Building CXX object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 21%] Building CXX object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
In file included from /workspace/codes/sycl-bench/single-kernel/sobel.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
In file included from /workspace/codes/sycl-bench/single-kernel/median.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
/workspace/codes/sycl-bench/polybench/fdtd2d.cpp:108:57: warning: lambda capture 'NX_' is not used [-Wunused-lambda-capture]
                                cgh.parallel_for<Fdtd2d2>(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) {
                                                                                  ~~^~~~~~~~~~
/workspace/codes/sycl-bench/polybench/fdtd2d.cpp:108:69: warning: lambda capture 'NY_' is not used [-Wunused-lambda-capture]
                                cgh.parallel_for<Fdtd2d2>(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) {
                                                                                              ~~^~~~~~~~~~
In file included from /workspace/codes/sycl-bench/polybench/gemm.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/syr2k.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:95:1: error: no matching function for call to 'hypot'
TRISYCL_MATH_WRAP2(hypot)
^~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel.cpp:87:34: note: in instantiation of function template specialization 'trisycl::hypot<trisycl::vec<float, 4> >' requested here
        cl::sycl::float4 color = hypot(Gx, Gy);
                                 ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:147:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1473:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  hypot(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1477:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  hypot(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1484:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    hypot(_Tp __x, _Up __y)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1903:5: note: candidate function template not viable: requires 3 arguments, but 2 were provided
    hypot(_Tp __x, _Up __y, _Vp __z)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1890:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(float __x, float __y, float __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1894:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(double __x, double __y, double __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1898:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(long double __x, long double __y, long double __z)
  ^
In file included from /workspace/codes/sycl-bench/single-kernel/median.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/median.cpp:173:30: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
      cl::sycl::float4 dif = fdim(output_acc.get_pointer()[i], expected);
                             ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
In file included from /workspace/codes/sycl-bench/polybench/mvt.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/gramschmidt.cpp:10:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel.cpp:131:30: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
      cl::sycl::float4 dif = fdim(output[i], expected);
                             ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
In file included from /workspace/codes/sycl-bench/polybench/syrk.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/bicg.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
2 warnings and 1 error generated.
In file included from /workspace/codes/sycl-bench/polybench/covariance.cpp:10:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
2 warnings and 2 errors generated.
CMakeFiles/median.dir/build.make:62: recipe for target 'CMakeFiles/median.dir/single-kernel/median.cpp.o' failed
make[2]: *** [CMakeFiles/median.dir/single-kernel/median.cpp.o] Error 1
make[2]: Target 'CMakeFiles/median.dir/build' not remade because of errors.
CMakeFiles/Makefile2:479: recipe for target 'CMakeFiles/median.dir/all' failed
make[1]: *** [CMakeFiles/median.dir/all] Error 2
Scanning dependencies of target pattern_L2
CMakeFiles/sobel.dir/build.make:62: recipe for target 'CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o' failed
make[2]: *** [CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o] Error 1
make[2]: Target 'CMakeFiles/sobel.dir/build' not remade because of errors.
CMakeFiles/Makefile2:627: recipe for target 'CMakeFiles/sobel.dir/all' failed
make[1]: *** [CMakeFiles/sobel.dir/all] Error 2
Scanning dependencies of target blocked_transform
[ 22%] Building CXX object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
In file included from /workspace/codes/sycl-bench/polybench/3DConvolution.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/fdtd2d.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
[ 23%] Building CXX object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
In file included from /workspace/codes/sycl-bench/polybench/correlation.cpp:10:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/3mm.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
/workspace/codes/sycl-bench/runtime/blocked_transform.cpp:81:25: error: no matching member function for call to 'get_access'
        auto acc = buff.get_access<sycl::access::mode::read_write>(
                   ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
1 error generated.
CMakeFiles/blocked_transform.dir/build.make:62: recipe for target 'CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o' failed
make[2]: *** [CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o] Error 1
make[2]: Target 'CMakeFiles/blocked_transform.dir/build' not remade because of errors.
CMakeFiles/Makefile2:701: recipe for target 'CMakeFiles/blocked_transform.dir/all' failed
make[1]: *** [CMakeFiles/blocked_transform.dir/all] Error 2
Scanning dependencies of target dag_task_throughput_independent
[ 25%] Building CXX object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
1 warning generated.
[ 26%] Linking CXX executable syr2k
1 warning generated.
[ 27%] Linking CXX executable matmulchain
[ 28%] Linking CXX executable gemm
1 warning generated.
[ 30%] Linking CXX executable syrk
[ 30%] Built target syr2k
Scanning dependencies of target kmeans
[ 30%] Built target matmulchain
Scanning dependencies of target reduction
[ 31%] Building CXX object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
[ 32%] Building CXX object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 32%] Built target gemm
Scanning dependencies of target DRAM
[ 34%] Building CXX object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 34%] Built target syrk
Scanning dependencies of target atax
1 warning generated.
[ 35%] Building CXX object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 36%] Linking CXX executable 3DConvolution
[ 36%] Built target 3DConvolution
Scanning dependencies of target scalar_prod
[ 38%] Building CXX object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
1 warning generated.
1 warning generated.
[ 39%] Linking CXX executable mvt
[ 40%] Linking CXX executable bicg
1 warning generated.
[ 42%] Linking CXX executable gramschmidt
[ 42%] Built target mvt
Scanning dependencies of target sobel5
[ 42%] Built target bicg
[ 43%] Building CXX object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
Scanning dependencies of target host_device_bandwidth
[ 44%] Building CXX object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 44%] Built target gramschmidt
Scanning dependencies of target sobel7
[ 46%] Building CXX object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
3 warnings generated.
[ 47%] Linking CXX executable fdtd2d
[ 47%] Built target fdtd2d
1 warning generated.
Scanning dependencies of target 2DConvolution
[ 48%] Linking CXX executable covariance
[ 50%] Building CXX object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 50%] Built target covariance
Scanning dependencies of target vec_add
[ 51%] Building CXX object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
1 warning generated.
[ 52%] Linking CXX executable 3mm
[ 52%] Built target 3mm
Scanning dependencies of target lin_reg_error
1 warning generated.
[ 53%] Building CXX object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 55%] Linking CXX executable correlation
[ 55%] Built target correlation
Scanning dependencies of target segmentedreduction
[ 56%] Building CXX object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
In file included from /workspace/codes/sycl-bench/single-kernel/sobel5.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:154:15: error: no member named 'copy' in 'trisycl::handler'
          cgh.copy(host_data.data(), acc);
          ~~~ ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:162:15: error: no member named 'copy' in 'trisycl::handler'
          cgh.copy(acc, host_data.data());
          ~~~ ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel7.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<int>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<int> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:242:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<int>>' requested here
    app.run< ReductionNDRange<int>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
In file included from /workspace/codes/sycl-bench/polybench/atax.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<long long>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<long long> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:243:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<long long>>' requested here
    app.run< ReductionNDRange<long long>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<float>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<float> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:244:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<float>>' requested here
    app.run< ReductionNDRange<float>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<double>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<double> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:245:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<double>>' requested here
    app.run< ReductionNDRange<double>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
4 errors generated.
CMakeFiles/reduction.dir/build.make:62: recipe for target 'CMakeFiles/reduction.dir/pattern/reduction.cpp.o' failed
make[2]: *** [CMakeFiles/reduction.dir/pattern/reduction.cpp.o] Error 1
make[2]: Target 'CMakeFiles/reduction.dir/build' not remade because of errors.
CMakeFiles/Makefile2:812: recipe for target 'CMakeFiles/reduction.dir/all' failed
make[1]: *** [CMakeFiles/reduction.dir/all] Error 2
[ 57%] Linking CXX executable arith
Scanning dependencies of target lin_reg_coeff
[ 59%] Building CXX object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 60%] Linking CXX executable local_mem
In file included from /workspace/codes/sycl-bench/single-kernel/sobel5.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:95:1: error: no matching function for call to 'hypot'
TRISYCL_MATH_WRAP2(hypot)
^~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel5.cpp:98:36: note: in instantiation of function template specialization 'trisycl::hypot<trisycl::vec<float, 4> >' requested here
          cl::sycl::float4 color = hypot(Gx, Gy);
                                   ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:147:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1473:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  hypot(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1477:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  hypot(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1484:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    hypot(_Tp __x, _Up __y)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1903:5: note: candidate function template not viable: requires 3 arguments, but 2 were provided
    hypot(_Tp __x, _Up __y, _Vp __z)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1890:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(float __x, float __y, float __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1894:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(double __x, double __y, double __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1898:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(long double __x, long double __y, long double __z)
  ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel5.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel5.cpp:140:32: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
        cl::sycl::float4 dif = fdim(output[i], expected);
                               ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel7.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:95:1: error: no matching function for call to 'hypot'
TRISYCL_MATH_WRAP2(hypot)
^~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel7.cpp:89:34: note: in instantiation of function template specialization 'trisycl::hypot<trisycl::vec<float, 4> >' requested here
        cl::sycl::float4 color = hypot(Gx, Gy);
                                 ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:147:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1473:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  hypot(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1477:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  hypot(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1484:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    hypot(_Tp __x, _Up __y)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1903:5: note: candidate function template not viable: requires 3 arguments, but 2 were provided
    hypot(_Tp __x, _Up __y, _Vp __z)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1890:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(float __x, float __y, float __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1894:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(double __x, double __y, double __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1898:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(long double __x, long double __y, long double __z)
  ^
[ 60%] Built target arith
In file included from /workspace/codes/sycl-bench/single-kernel/sobel7.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel7.cpp:140:30: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
      cl::sycl::float4 dif = fdim(output[i], expected);
                             ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
Scanning dependencies of target mol_dyn
2 warnings and 2 errors generated.
[ 61%] Building CXX object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
CMakeFiles/sobel5.dir/build.make:62: recipe for target 'CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o' failed
make[2]: *** [CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o] Error 1
make[2]: Target 'CMakeFiles/sobel5.dir/build' not remade because of errors.
CMakeFiles/Makefile2:960: recipe for target 'CMakeFiles/sobel5.dir/all' failed
make[1]: *** [CMakeFiles/sobel5.dir/all] Error 2
[ 61%] Built target local_mem
Scanning dependencies of target gesummv
Scanning dependencies of target 2mm
[ 63%] Building CXX object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 64%] Building CXX object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
2 warnings and 2 errors generated.
CMakeFiles/sobel7.dir/build.make:62: recipe for target 'CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o' failed
make[2]: *** [CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o] Error 1
make[2]: Target 'CMakeFiles/sobel7.dir/build' not remade because of errors.
CMakeFiles/Makefile2:1034: recipe for target 'CMakeFiles/sobel7.dir/all' failed
make[1]: *** [CMakeFiles/sobel7.dir/all] Error 2
Scanning dependencies of target sf
[ 65%] Building CXX object CMakeFiles/sf.dir/micro/sf.cpp.o
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:152:39: error: no matching member function for call to 'get_access'
          auto acc = buffer->template get_access<s::access::mode::discard_write>(
                     ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:248:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:152:39: error: no matching member function for call to 'get_access'
          auto acc = buffer->template get_access<s::access::mode::discard_write>(
                     ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:249:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:152:39: error: no matching member function for call to 'get_access'
          auto acc = buffer->template get_access<s::access::mode::discard_write>(
                     ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:250:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:161:32: error: no matching member function for call to 'get_access'
              buffer->template get_access<s::access::mode::read>(cgh, copy_size, getStridedCopyOffset<Dims, true>());
              ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:252:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:161:32: error: no matching member function for call to 'get_access'
              buffer->template get_access<s::access::mode::read>(cgh, copy_size, getStridedCopyOffset<Dims, true>());
              ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:253:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:161:32: error: no matching member function for call to 'get_access'
              buffer->template get_access<s::access::mode::read>(cgh, copy_size, getStridedCopyOffset<Dims, true>());
              ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:254:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
8 errors generated.
In file included from /workspace/codes/sycl-bench/polybench/2DConvolution.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
CMakeFiles/host_device_bandwidth.dir/build.make:62: recipe for target 'CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o' failed
make[2]: *** [CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o] Error 1
make[2]: Target 'CMakeFiles/host_device_bandwidth.dir/build' not remade because of errors.
CMakeFiles/Makefile2:997: recipe for target 'CMakeFiles/host_device_bandwidth.dir/all' failed
make[1]: *** [CMakeFiles/host_device_bandwidth.dir/all] Error 2
Scanning dependencies of target nbody
[ 67%] Building CXX object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:90:42: error: no member named 'rsqrt' in namespace 'cl::sycl'; did you mean 'sqrt'?
          const float_type r_inv = sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + 
                                   ~~~~~~^~~~~
                                         sqrt
/tmp/triSYCL-master/include/triSYCL/math.hpp:153:19: note: 'sqrt' declared here
TRISYCL_MATH_WRAP(sqrt)
                  ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:185:27: error: no member named 'rsqrt' in namespace 'cl::sycl'; did you mean 'sqrt'?
                    sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + gravitational_softening);
                    ~~~~~~^~~~~
                          sqrt
/tmp/triSYCL-master/include/triSYCL/math.hpp:153:19: note: 'sqrt' declared here
TRISYCL_MATH_WRAP(sqrt)
                  ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:232:19: error: no member named 'private_memory' in namespace 'cl::sycl'
            sycl::private_memory<particle_type> my_particle{grp};
            ~~~~~~^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:232:34: error: unexpected type name 'particle_type': expected expression
            sycl::private_memory<particle_type> my_particle{grp};
                                 ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:232:49: error: use of undeclared identifier 'my_particle'; did you mean 'particles'?
            sycl::private_memory<particle_type> my_particle{grp};
                                                ^~~~~~~~~~~
                                                particles
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:213:56: note: 'particles' declared here
  void submitHierarchical(sycl::buffer<particle_type>& particles, sycl::buffer<vector_type>& velocities) {
                                                       ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:233:19: error: no member named 'private_memory' in namespace 'cl::sycl'
            sycl::private_memory<vector_type> acceleration{grp};
            ~~~~~~^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:233:34: error: unexpected type name 'vector_type': expected expression
            sycl::private_memory<vector_type> acceleration{grp};
                                 ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:233:47: error: use of undeclared identifier 'acceleration'
            sycl::private_memory<vector_type> acceleration{grp};
                                              ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:236:15: error: use of undeclared identifier 'acceleration'
              acceleration(idx) = vector_type{static_cast<float_type>(0.0f)};
              ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:237:15: error: use of undeclared identifier 'my_particle'
              my_particle(idx) = (idx.get_global_id(0) < problem_size) ? particles_access[idx.get_global_id(0)]
              ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:252:46: error: use of undeclared identifier 'my_particle'
                  const particle_type my_p = my_particle(idx);
                                             ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:257:29: error: no member named 'rsqrt' in namespace 'cl::sycl'; did you mean 'sqrt'?
                      sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + gravitational_softening);
                      ~~~~~~^~~~~
                            sqrt
/tmp/triSYCL-master/include/triSYCL/math.hpp:153:19: note: 'sqrt' declared here
TRISYCL_MATH_WRAP(sqrt)
                  ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:261:21: error: use of undeclared identifier 'acceleration'
                    acceleration(idx) += static_cast<float_type>(p.w()) * r_inv * r_inv * r_inv * R;
                    ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:273:20: error: use of undeclared identifier 'acceleration'
              v += acceleration(idx) * dt;
                   ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:276:36: error: use of undeclared identifier 'my_particle'
              particle_type my_p = my_particle(idx);
                                   ^
In file included from /workspace/codes/sycl-bench/polybench/2mm.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/gesummv.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:252:39: error: default initialization of an object of const type 'const NBody<float>::particle_type' (aka 'const vec<float, 4>') without a user-provided default constructor
                  const particle_type my_p = my_particle(idx);
                                      ^
                                          {}
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:327:11: note: in instantiation of member function 'NBody<float>::submitHierarchical' requested here
    this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get());
          ^
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'NBodyHierarchical<float>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<NBodyHierarchical<float> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:344:7: note: in instantiation of function template specialization 'BenchmarkApp::run<NBodyHierarchical<float>>' requested here
  app.run< NBodyHierarchical<float> >();
      ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:252:39: error: default initialization of an object of const type 'const NBody<double>::particle_type' (aka 'const vec<double, 4>') without a user-provided default constructor
                  const particle_type my_p = my_particle(idx);
                                      ^
                                          {}
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:327:11: note: in instantiation of member function 'NBody<double>::submitHierarchical' requested here
    this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get());
          ^
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'NBodyHierarchical<double>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<NBodyHierarchical<double> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:345:7: note: in instantiation of function template specialization 'BenchmarkApp::run<NBodyHierarchical<double>>' requested here
  app.run< NBodyHierarchical<double> >();
      ^
17 errors generated.
CMakeFiles/nbody.dir/build.make:62: recipe for target 'CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o' failed
make[2]: *** [CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o] Error 1
make[2]: Target 'CMakeFiles/nbody.dir/build' not remade because of errors.
CMakeFiles/Makefile2:1404: recipe for target 'CMakeFiles/nbody.dir/all' failed
make[1]: *** [CMakeFiles/nbody.dir/all] Error 2
Scanning dependencies of target dag_task_throughput_sequential
[ 68%] Linking CXX executable dag_task_throughput_independent
[ 69%] Building CXX object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 69%] Built target dag_task_throughput_independent
1 warning generated.
[ 71%] Linking CXX executable atax
[ 71%] Built target atax
1 warning generated.
[ 72%] Linking CXX executable 2DConvolution
[ 72%] Built target 2DConvolution
[ 73%] Linking CXX executable kmeans
[ 73%] Built target kmeans
[ 75%] Linking CXX executable lin_reg_error
[ 75%] Built target lin_reg_error
1 warning generated.
[ 76%] Linking CXX executable gesummv
1 warning generated.
[ 77%] Linking CXX executable 2mm
[ 78%] Linking CXX executable mol_dyn
[ 78%] Built target gesummv
[ 78%] Built target 2mm
[ 78%] Built target mol_dyn
[ 80%] Linking CXX executable vec_add
[ 80%] Built target vec_add
[ 81%] Linking CXX executable sf
[ 81%] Built target sf
[ 82%] Linking CXX executable pattern_L2
[ 84%] Linking CXX executable lin_reg_coeff
[ 84%] Built target pattern_L2
[ 84%] Built target lin_reg_coeff
[ 85%] Linking CXX executable DRAM
[ 85%] Built target DRAM
[ 86%] Linking CXX executable dag_task_throughput_sequential
[ 86%] Built target dag_task_throughput_sequential
[ 88%] Linking CXX executable scalar_prod
[ 88%] Built target scalar_prod
[ 89%] Linking CXX executable segmentedreduction
[ 89%] Built target segmentedreduction
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Or if you want to build with just TBB and no OpenMP use:

In [13]:
! rm -r ./trisycl-cpu-benchmarks
! mkdir ./trisycl-cpu-benchmarks && cd ./trisycl-cpu-benchmarks && cmake ../.. -DSYCL_IMPL=triSYCL -DTRISYCL_TBB=ON -DTRISYCL_OPENMP=OFF -DTRISYCL_INCLUDE_DIR=/tmp/triSYCL-master/include && make -j16 --keep-going
! cd ./trisycl-cpu-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
-- The C compiler identification is Clang 9.0.1
-- The CXX compiler identification is Clang 9.0.1
-- Check for working C compiler: /llvm-9.0.1/bin/clang
-- Check for working C compiler: /llvm-9.0.1/bin/clang -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++
-- Check for working CXX compiler: /llvm-9.0.1/bin/clang++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- host compiler - clang 9.0.1
-- Found triSYCL include directory: /tmp/triSYCL-master/include
-- Found OpenMP_C: -fopenmp=libomp (found version "3.1") 
-- Found OpenMP_CXX: -fopenmp=libomp (found version "3.1") 
-- Found OpenMP: TRUE (found version "3.1")  
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Looking for pthread_create
-- Looking for pthread_create - not found
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE  
-- Boost version: 1.65.1
-- Found the following Boost libraries:
--   chrono
--   log
--   system
--   date_time
--   log_setup
--   filesystem
--   thread
--   regex
--   atomic
-- triSYCL OpenMP:                   ON
-- triSYCL TBB:                      ON
-- triSYCL OpenCL:                   OFF
-- triSYCL synchronous execution:    OFF
-- triSYCL debug mode:               OFF
-- triSYCL object trace:             OFF
-- triSYCL kernel trace:             OFF
-- Configuring done
-- Generating done
-- Build files have been written to: /workspace/codes/sycl-bench/bin/trisycl-cpu-benchmarks
Scanning dependencies of target syrk
Scanning dependencies of target mvt
Scanning dependencies of target gemm
Scanning dependencies of target gramschmidt
Scanning dependencies of target fdtd2d
Scanning dependencies of target covariance
Scanning dependencies of target syr2k
Scanning dependencies of target bicg
Scanning dependencies of target 3DConvolution
Scanning dependencies of target median
Scanning dependencies of target matmulchain
Scanning dependencies of target 3mm
Scanning dependencies of target correlation
Scanning dependencies of target local_mem
Scanning dependencies of target arith
Scanning dependencies of target sobel
[  1%] Building CXX object CMakeFiles/mvt.dir/polybench/mvt.cpp.o
[  2%] Building CXX object CMakeFiles/syr2k.dir/polybench/syr2k.cpp.o
[  3%] Building CXX object CMakeFiles/covariance.dir/polybench/covariance.cpp.o
[  5%] Building CXX object CMakeFiles/gramschmidt.dir/polybench/gramschmidt.cpp.o
[  6%] Building CXX object CMakeFiles/gemm.dir/polybench/gemm.cpp.o
[  9%] Building CXX object CMakeFiles/local_mem.dir/micro/local_mem.cpp.o
[  9%] Building CXX object CMakeFiles/median.dir/single-kernel/median.cpp.o
[ 10%] Building CXX object CMakeFiles/fdtd2d.dir/polybench/fdtd2d.cpp.o
[ 11%] Building CXX object CMakeFiles/correlation.dir/polybench/correlation.cpp.o
[ 13%] Building CXX object CMakeFiles/matmulchain.dir/runtime/matmulchain.cpp.o
[ 14%] Building CXX object CMakeFiles/3mm.dir/polybench/3mm.cpp.o
[ 15%] Building CXX object CMakeFiles/syrk.dir/polybench/syrk.cpp.o
[ 17%] Building CXX object CMakeFiles/bicg.dir/polybench/bicg.cpp.o
[ 18%] Building CXX object CMakeFiles/arith.dir/micro/arith.cpp.o
[ 19%] Building CXX object CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o
[ 21%] Building CXX object CMakeFiles/3DConvolution.dir/polybench/3DConvolution.cpp.o
In file included from /workspace/codes/sycl-bench/single-kernel/sobel.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
In file included from /workspace/codes/sycl-bench/single-kernel/median.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
/workspace/codes/sycl-bench/polybench/fdtd2d.cpp:108:57: warning: lambda capture 'NX_' is not used [-Wunused-lambda-capture]
                                cgh.parallel_for<Fdtd2d2>(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) {
                                                                                  ~~^~~~~~~~~~
/workspace/codes/sycl-bench/polybench/fdtd2d.cpp:108:69: warning: lambda capture 'NY_' is not used [-Wunused-lambda-capture]
                                cgh.parallel_for<Fdtd2d2>(range<2>(size, size), [=, NX_ = size, NY_ = size](item<2> item) {
                                                                                              ~~^~~~~~~~~~
In file included from /workspace/codes/sycl-bench/polybench/gemm.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/syr2k.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:95:1: error: no matching function for call to 'hypot'
TRISYCL_MATH_WRAP2(hypot)
^~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel.cpp:87:34: note: in instantiation of function template specialization 'trisycl::hypot<trisycl::vec<float, 4> >' requested here
        cl::sycl::float4 color = hypot(Gx, Gy);
                                 ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:147:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1473:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  hypot(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1477:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  hypot(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1484:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    hypot(_Tp __x, _Up __y)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1903:5: note: candidate function template not viable: requires 3 arguments, but 2 were provided
    hypot(_Tp __x, _Up __y, _Vp __z)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1890:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(float __x, float __y, float __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1894:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(double __x, double __y, double __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1898:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(long double __x, long double __y, long double __z)
  ^
In file included from /workspace/codes/sycl-bench/single-kernel/median.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/median.cpp:173:30: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
      cl::sycl::float4 dif = fdim(output_acc.get_pointer()[i], expected);
                             ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
In file included from /workspace/codes/sycl-bench/polybench/mvt.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/gramschmidt.cpp:10:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel.cpp:131:30: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
      cl::sycl::float4 dif = fdim(output[i], expected);
                             ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
In file included from /workspace/codes/sycl-bench/polybench/syrk.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/bicg.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
2 warnings and 1 error generated.
In file included from /workspace/codes/sycl-bench/polybench/covariance.cpp:10:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
2 warnings and 2 errors generated.
CMakeFiles/median.dir/build.make:62: recipe for target 'CMakeFiles/median.dir/single-kernel/median.cpp.o' failed
make[2]: *** [CMakeFiles/median.dir/single-kernel/median.cpp.o] Error 1
make[2]: Target 'CMakeFiles/median.dir/build' not remade because of errors.
CMakeFiles/Makefile2:479: recipe for target 'CMakeFiles/median.dir/all' failed
make[1]: *** [CMakeFiles/median.dir/all] Error 2
Scanning dependencies of target pattern_L2
CMakeFiles/sobel.dir/build.make:62: recipe for target 'CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o' failed
make[2]: *** [CMakeFiles/sobel.dir/single-kernel/sobel.cpp.o] Error 1
make[2]: Target 'CMakeFiles/sobel.dir/build' not remade because of errors.
CMakeFiles/Makefile2:627: recipe for target 'CMakeFiles/sobel.dir/all' failed
make[1]: *** [CMakeFiles/sobel.dir/all] Error 2
Scanning dependencies of target blocked_transform
[ 22%] Building CXX object CMakeFiles/pattern_L2.dir/micro/pattern_L2.cpp.o
In file included from /workspace/codes/sycl-bench/polybench/3DConvolution.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/fdtd2d.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
[ 23%] Building CXX object CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o
In file included from /workspace/codes/sycl-bench/polybench/correlation.cpp:10:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/3mm.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
/workspace/codes/sycl-bench/runtime/blocked_transform.cpp:81:25: error: no matching member function for call to 'get_access'
        auto acc = buff.get_access<sycl::access::mode::read_write>(
                   ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
1 error generated.
CMakeFiles/blocked_transform.dir/build.make:62: recipe for target 'CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o' failed
make[2]: *** [CMakeFiles/blocked_transform.dir/runtime/blocked_transform.cpp.o] Error 1
make[2]: Target 'CMakeFiles/blocked_transform.dir/build' not remade because of errors.
CMakeFiles/Makefile2:701: recipe for target 'CMakeFiles/blocked_transform.dir/all' failed
make[1]: *** [CMakeFiles/blocked_transform.dir/all] Error 2
Scanning dependencies of target dag_task_throughput_independent
[ 25%] Building CXX object CMakeFiles/dag_task_throughput_independent.dir/runtime/dag_task_throughput_independent.cpp.o
1 warning generated.
[ 26%] Linking CXX executable syr2k
1 warning generated.
[ 27%] Linking CXX executable matmulchain
[ 28%] Linking CXX executable gemm
1 warning generated.
[ 30%] Linking CXX executable syrk
[ 30%] Built target syr2k
Scanning dependencies of target kmeans
[ 30%] Built target matmulchain
Scanning dependencies of target reduction
[ 31%] Building CXX object CMakeFiles/kmeans.dir/single-kernel/kmeans.cpp.o
[ 32%] Building CXX object CMakeFiles/reduction.dir/pattern/reduction.cpp.o
[ 32%] Built target gemm
Scanning dependencies of target DRAM
[ 34%] Building CXX object CMakeFiles/DRAM.dir/micro/DRAM.cpp.o
[ 34%] Built target syrk
Scanning dependencies of target atax
1 warning generated.
[ 35%] Building CXX object CMakeFiles/atax.dir/polybench/atax.cpp.o
[ 36%] Linking CXX executable 3DConvolution
[ 36%] Built target 3DConvolution
Scanning dependencies of target scalar_prod
[ 38%] Building CXX object CMakeFiles/scalar_prod.dir/single-kernel/scalar_prod.cpp.o
1 warning generated.
1 warning generated.
[ 39%] Linking CXX executable mvt
[ 40%] Linking CXX executable bicg
1 warning generated.
[ 42%] Linking CXX executable gramschmidt
[ 42%] Built target mvt
Scanning dependencies of target sobel5
[ 42%] Built target bicg
[ 43%] Building CXX object CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o
Scanning dependencies of target host_device_bandwidth
[ 44%] Building CXX object CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o
[ 44%] Built target gramschmidt
Scanning dependencies of target sobel7
[ 46%] Building CXX object CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o
3 warnings generated.
[ 47%] Linking CXX executable fdtd2d
[ 47%] Built target fdtd2d
1 warning generated.
Scanning dependencies of target 2DConvolution
[ 48%] Linking CXX executable covariance
[ 50%] Building CXX object CMakeFiles/2DConvolution.dir/polybench/2DConvolution.cpp.o
[ 50%] Built target covariance
Scanning dependencies of target vec_add
[ 51%] Building CXX object CMakeFiles/vec_add.dir/single-kernel/vec_add.cpp.o
1 warning generated.
[ 52%] Linking CXX executable 3mm
[ 52%] Built target 3mm
Scanning dependencies of target lin_reg_error
1 warning generated.
[ 53%] Building CXX object CMakeFiles/lin_reg_error.dir/single-kernel/lin_reg_error.cpp.o
[ 55%] Linking CXX executable correlation
[ 55%] Built target correlation
Scanning dependencies of target segmentedreduction
[ 56%] Building CXX object CMakeFiles/segmentedreduction.dir/pattern/segmentedreduction.cpp.o
In file included from /workspace/codes/sycl-bench/single-kernel/sobel5.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:154:15: error: no member named 'copy' in 'trisycl::handler'
          cgh.copy(host_data.data(), acc);
          ~~~ ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:162:15: error: no member named 'copy' in 'trisycl::handler'
          cgh.copy(acc, host_data.data());
          ~~~ ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel7.cpp:5:
/workspace/codes/sycl-bench/include/bitmap.h:268:31: warning: missing field 'creator1' initializer [-Wmissing-field-initializers]
                bmpfile_header header = { 0 };
                                            ^
/workspace/codes/sycl-bench/include/bitmap.h:274:35: warning: missing field 'width' initializer [-Wmissing-field-initializers]
                bmpfile_dib_info dib_info = { 0 };
                                                ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<int>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<int> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:242:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<int>>' requested here
    app.run< ReductionNDRange<int>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
In file included from /workspace/codes/sycl-bench/polybench/atax.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<long long>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<long long> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:243:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<long long>>' requested here
    app.run< ReductionNDRange<long long>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<float>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<float> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:244:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<float>>' requested here
    app.run< ReductionNDRange<float>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:63:45: error: no matching member function for call to 'get_access'
    T result = _final_output_buff->template get_access<sycl::access::mode::read>(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:118:21: note: in instantiation of member function 'Reduction<double>::verify' requested here
              if(!b.verify(args.verification)) {
                    ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<ReductionNDRange<double> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/pattern/reduction.cpp:245:9: note: in instantiation of function template specialization 'BenchmarkApp::run<ReductionNDRange<double>>' requested here
    app.run< ReductionNDRange<double>>();
        ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 2 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 2 were provided
  get_access() {
  ^
4 errors generated.
CMakeFiles/reduction.dir/build.make:62: recipe for target 'CMakeFiles/reduction.dir/pattern/reduction.cpp.o' failed
make[2]: *** [CMakeFiles/reduction.dir/pattern/reduction.cpp.o] Error 1
make[2]: Target 'CMakeFiles/reduction.dir/build' not remade because of errors.
CMakeFiles/Makefile2:812: recipe for target 'CMakeFiles/reduction.dir/all' failed
make[1]: *** [CMakeFiles/reduction.dir/all] Error 2
[ 57%] Linking CXX executable arith
Scanning dependencies of target lin_reg_coeff
[ 59%] Building CXX object CMakeFiles/lin_reg_coeff.dir/single-kernel/lin_reg_coeff.cpp.o
[ 60%] Linking CXX executable local_mem
In file included from /workspace/codes/sycl-bench/single-kernel/sobel5.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:95:1: error: no matching function for call to 'hypot'
TRISYCL_MATH_WRAP2(hypot)
^~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel5.cpp:98:36: note: in instantiation of function template specialization 'trisycl::hypot<trisycl::vec<float, 4> >' requested here
          cl::sycl::float4 color = hypot(Gx, Gy);
                                   ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:147:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1473:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  hypot(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1477:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  hypot(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1484:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    hypot(_Tp __x, _Up __y)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1903:5: note: candidate function template not viable: requires 3 arguments, but 2 were provided
    hypot(_Tp __x, _Up __y, _Vp __z)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1890:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(float __x, float __y, float __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1894:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(double __x, double __y, double __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1898:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(long double __x, long double __y, long double __z)
  ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel5.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel5.cpp:140:32: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
        cl::sycl::float4 dif = fdim(output[i], expected);
                               ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
In file included from /workspace/codes/sycl-bench/single-kernel/sobel7.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:95:1: error: no matching function for call to 'hypot'
TRISYCL_MATH_WRAP2(hypot)
^~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel7.cpp:89:34: note: in instantiation of function template specialization 'trisycl::hypot<trisycl::vec<float, 4> >' requested here
        cl::sycl::float4 color = hypot(Gx, Gy);
                                 ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:147:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (hypot,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1473:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  hypot(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1477:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  hypot(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1484:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    hypot(_Tp __x, _Up __y)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1903:5: note: candidate function template not viable: requires 3 arguments, but 2 were provided
    hypot(_Tp __x, _Up __y, _Vp __z)
    ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1890:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(float __x, float __y, float __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1894:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(double __x, double __y, double __z)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1898:3: note: candidate function not viable: requires 3 arguments, but 2 were provided
  hypot(long double __x, long double __y, long double __z)
  ^
[ 60%] Built target arith
In file included from /workspace/codes/sycl-bench/single-kernel/sobel7.cpp:1:
In file included from /tmp/triSYCL-master/include/CL/sycl.hpp:10:
In file included from /tmp/triSYCL-master/include/triSYCL/sycl.hpp:66:
/tmp/triSYCL-master/include/triSYCL/math.hpp:83:1: error: no matching function for call to 'fdim'
TRISYCL_MATH_WRAP2(fdim)
^~~~~~~~~~~~~~~~~~~~~~~~
/tmp/triSYCL-master/include/triSYCL/math.hpp:34:12: note: expanded from macro 'TRISYCL_MATH_WRAP2'
    return std::FUN(x, y);                                                     \
           ^~~~~~~~
/workspace/codes/sycl-bench/single-kernel/sobel7.cpp:140:30: note: in instantiation of function template specialization 'trisycl::fdim<trisycl::vec<float, 4> >' requested here
      cl::sycl::float4 dif = fdim(output[i], expected);
                             ^
/usr/include/x86_64-linux-gnu/bits/mathcalls.h:326:13: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'double' for 1st argument
__MATHCALL (fdim,, (_Mdouble_ __x, _Mdouble_ __y));
            ^
/usr/include/math.h:273:25: note: expanded from macro '__MATHCALL'
  __MATHDECL (_Mdouble_,function,suffix, args)
                        ^
/usr/include/math.h:275:22: note: expanded from macro '__MATHDECL'
  __MATHDECL_1(type, function,suffix, args); \
                     ^
/usr/include/math.h:283:31: note: expanded from macro '__MATHDECL_1'
  extern type __MATH_PRECNAME(function,suffix) args __THROW
                              ^
/usr/include/math.h:286:42: note: expanded from macro '__MATH_PRECNAME'
#define __MATH_PRECNAME(name,r) __CONCAT(name,r)
                                         ^
/usr/include/x86_64-linux-gnu/sys/cdefs.h:100:23: note: expanded from macro '__CONCAT'
#define __CONCAT(x,y)   x ## y
                        ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1393:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'float' for 1st argument
  fdim(float __x, float __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1397:3: note: candidate function not viable: no known conversion from 'trisycl::vec<float, 4>' to 'long double' for 1st argument
  fdim(long double __x, long double __y)
  ^
/usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/cmath:1404:5: note: candidate template ignored: substitution failure [with _Tp = trisycl::vec<float, 4>, _Up = trisycl::vec<float, 4>]: no type named '__type' in '__gnu_cxx::__promote<trisycl::vec<float, 4>, false>'
    fdim(_Tp __x, _Up __y)
    ^
Scanning dependencies of target mol_dyn
2 warnings and 2 errors generated.
[ 61%] Building CXX object CMakeFiles/mol_dyn.dir/single-kernel/mol_dyn.cpp.o
CMakeFiles/sobel5.dir/build.make:62: recipe for target 'CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o' failed
make[2]: *** [CMakeFiles/sobel5.dir/single-kernel/sobel5.cpp.o] Error 1
make[2]: Target 'CMakeFiles/sobel5.dir/build' not remade because of errors.
CMakeFiles/Makefile2:960: recipe for target 'CMakeFiles/sobel5.dir/all' failed
make[1]: *** [CMakeFiles/sobel5.dir/all] Error 2
[ 61%] Built target local_mem
Scanning dependencies of target gesummv
Scanning dependencies of target 2mm
[ 63%] Building CXX object CMakeFiles/gesummv.dir/polybench/gesummv.cpp.o
[ 64%] Building CXX object CMakeFiles/2mm.dir/polybench/2mm.cpp.o
2 warnings and 2 errors generated.
CMakeFiles/sobel7.dir/build.make:62: recipe for target 'CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o' failed
make[2]: *** [CMakeFiles/sobel7.dir/single-kernel/sobel7.cpp.o] Error 1
make[2]: Target 'CMakeFiles/sobel7.dir/build' not remade because of errors.
CMakeFiles/Makefile2:1034: recipe for target 'CMakeFiles/sobel7.dir/all' failed
make[1]: *** [CMakeFiles/sobel7.dir/all] Error 2
Scanning dependencies of target sf
[ 65%] Building CXX object CMakeFiles/sf.dir/micro/sf.cpp.o
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:152:39: error: no matching member function for call to 'get_access'
          auto acc = buffer->template get_access<s::access::mode::discard_write>(
                     ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:248:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<1, CopyDirection::HOST_TO_DEVICE, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:152:39: error: no matching member function for call to 'get_access'
          auto acc = buffer->template get_access<s::access::mode::discard_write>(
                     ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:249:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<2, CopyDirection::HOST_TO_DEVICE, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:152:39: error: no matching member function for call to 'get_access'
          auto acc = buffer->template get_access<s::access::mode::discard_write>(
                     ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:250:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<3, CopyDirection::HOST_TO_DEVICE, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:161:32: error: no matching member function for call to 'get_access'
              buffer->template get_access<s::access::mode::read>(cgh, copy_size, getStridedCopyOffset<Dims, true>());
              ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:252:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<1, CopyDirection::DEVICE_TO_HOST, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:161:32: error: no matching member function for call to 'get_access'
              buffer->template get_access<s::access::mode::read>(cgh, copy_size, getStridedCopyOffset<Dims, true>());
              ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:253:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<2, CopyDirection::DEVICE_TO_HOST, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:161:32: error: no matching member function for call to 'get_access'
              buffer->template get_access<s::access::mode::read>(cgh, copy_size, getStridedCopyOffset<Dims, true>());
              ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/micro/host_device_bandwidth.cpp:254:7: note: in instantiation of function template specialization 'BenchmarkApp::run<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>>' requested here
  app.run<MicroBenchHostDeviceBandwidth<3, CopyDirection::DEVICE_TO_HOST, true>>();
      ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:346:3: note: candidate function template not viable: requires single argument 'command_group_handler', but 3 arguments were provided
  get_access(handler &command_group_handler) {
  ^
/tmp/triSYCL-master/include/triSYCL/buffer.hpp:375:3: note: candidate function template not viable: requires 0 arguments, but 3 were provided
  get_access() {
  ^
8 errors generated.
In file included from /workspace/codes/sycl-bench/polybench/2DConvolution.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
CMakeFiles/host_device_bandwidth.dir/build.make:62: recipe for target 'CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o' failed
make[2]: *** [CMakeFiles/host_device_bandwidth.dir/micro/host_device_bandwidth.cpp.o] Error 1
make[2]: Target 'CMakeFiles/host_device_bandwidth.dir/build' not remade because of errors.
CMakeFiles/Makefile2:997: recipe for target 'CMakeFiles/host_device_bandwidth.dir/all' failed
make[1]: *** [CMakeFiles/host_device_bandwidth.dir/all] Error 2
Scanning dependencies of target nbody
[ 67%] Building CXX object CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:90:42: error: no member named 'rsqrt' in namespace 'cl::sycl'; did you mean 'sqrt'?
          const float_type r_inv = sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + 
                                   ~~~~~~^~~~~
                                         sqrt
/tmp/triSYCL-master/include/triSYCL/math.hpp:153:19: note: 'sqrt' declared here
TRISYCL_MATH_WRAP(sqrt)
                  ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:185:27: error: no member named 'rsqrt' in namespace 'cl::sycl'; did you mean 'sqrt'?
                    sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + gravitational_softening);
                    ~~~~~~^~~~~
                          sqrt
/tmp/triSYCL-master/include/triSYCL/math.hpp:153:19: note: 'sqrt' declared here
TRISYCL_MATH_WRAP(sqrt)
                  ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:232:19: error: no member named 'private_memory' in namespace 'cl::sycl'
            sycl::private_memory<particle_type> my_particle{grp};
            ~~~~~~^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:232:34: error: unexpected type name 'particle_type': expected expression
            sycl::private_memory<particle_type> my_particle{grp};
                                 ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:232:49: error: use of undeclared identifier 'my_particle'; did you mean 'particles'?
            sycl::private_memory<particle_type> my_particle{grp};
                                                ^~~~~~~~~~~
                                                particles
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:213:56: note: 'particles' declared here
  void submitHierarchical(sycl::buffer<particle_type>& particles, sycl::buffer<vector_type>& velocities) {
                                                       ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:233:19: error: no member named 'private_memory' in namespace 'cl::sycl'
            sycl::private_memory<vector_type> acceleration{grp};
            ~~~~~~^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:233:34: error: unexpected type name 'vector_type': expected expression
            sycl::private_memory<vector_type> acceleration{grp};
                                 ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:233:47: error: use of undeclared identifier 'acceleration'
            sycl::private_memory<vector_type> acceleration{grp};
                                              ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:236:15: error: use of undeclared identifier 'acceleration'
              acceleration(idx) = vector_type{static_cast<float_type>(0.0f)};
              ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:237:15: error: use of undeclared identifier 'my_particle'
              my_particle(idx) = (idx.get_global_id(0) < problem_size) ? particles_access[idx.get_global_id(0)]
              ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:252:46: error: use of undeclared identifier 'my_particle'
                  const particle_type my_p = my_particle(idx);
                                             ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:257:29: error: no member named 'rsqrt' in namespace 'cl::sycl'; did you mean 'sqrt'?
                      sycl::rsqrt(R.x() * R.x() + R.y() * R.y() + R.z() * R.z() + gravitational_softening);
                      ~~~~~~^~~~~
                            sqrt
/tmp/triSYCL-master/include/triSYCL/math.hpp:153:19: note: 'sqrt' declared here
TRISYCL_MATH_WRAP(sqrt)
                  ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:261:21: error: use of undeclared identifier 'acceleration'
                    acceleration(idx) += static_cast<float_type>(p.w()) * r_inv * r_inv * r_inv * R;
                    ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:273:20: error: use of undeclared identifier 'acceleration'
              v += acceleration(idx) * dt;
                   ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:276:36: error: use of undeclared identifier 'my_particle'
              particle_type my_p = my_particle(idx);
                                   ^
In file included from /workspace/codes/sycl-bench/polybench/2mm.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
In file included from /workspace/codes/sycl-bench/polybench/gesummv.cpp:9:
/workspace/codes/sycl-bench/polybench/common/polybenchUtilFuncts.h:43:13: warning: unused function 'shouldDoCpu' [-Wunused-function]
static bool shouldDoCpu(void) {
            ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:252:39: error: default initialization of an object of const type 'const NBody<float>::particle_type' (aka 'const vec<float, 4>') without a user-provided default constructor
                  const particle_type my_p = my_particle(idx);
                                      ^
                                          {}
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:327:11: note: in instantiation of member function 'NBody<float>::submitHierarchical' requested here
    this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get());
          ^
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'NBodyHierarchical<float>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<NBodyHierarchical<float> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:344:7: note: in instantiation of function template specialization 'BenchmarkApp::run<NBodyHierarchical<float>>' requested here
  app.run< NBodyHierarchical<float> >();
      ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:252:39: error: default initialization of an object of const type 'const NBody<double>::particle_type' (aka 'const vec<double, 4>') without a user-provided default constructor
                  const particle_type my_p = my_particle(idx);
                                      ^
                                          {}
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:327:11: note: in instantiation of member function 'NBody<double>::submitHierarchical' requested here
    this->submitHierarchical(this->particles_buf.get(), this->velocities_buf.get());
          ^
/workspace/codes/sycl-bench/include/common.h:89:13: note: in instantiation of member function 'NBodyHierarchical<double>::run' requested here
          b.run();
            ^
/workspace/codes/sycl-bench/include/common.h:220:11: note: in instantiation of function template specialization 'BenchmarkManager<NBodyHierarchical<double> >::run<>' requested here
      mgr.run(additional_args...);
          ^
/workspace/codes/sycl-bench/single-kernel/nbody.cpp:345:7: note: in instantiation of function template specialization 'BenchmarkApp::run<NBodyHierarchical<double>>' requested here
  app.run< NBodyHierarchical<double> >();
      ^
17 errors generated.
CMakeFiles/nbody.dir/build.make:62: recipe for target 'CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o' failed
make[2]: *** [CMakeFiles/nbody.dir/single-kernel/nbody.cpp.o] Error 1
make[2]: Target 'CMakeFiles/nbody.dir/build' not remade because of errors.
CMakeFiles/Makefile2:1404: recipe for target 'CMakeFiles/nbody.dir/all' failed
make[1]: *** [CMakeFiles/nbody.dir/all] Error 2
Scanning dependencies of target dag_task_throughput_sequential
[ 68%] Linking CXX executable dag_task_throughput_independent
[ 69%] Building CXX object CMakeFiles/dag_task_throughput_sequential.dir/runtime/dag_task_throughput_sequential.cpp.o
[ 69%] Built target dag_task_throughput_independent
1 warning generated.
[ 71%] Linking CXX executable atax
[ 71%] Built target atax
1 warning generated.
[ 72%] Linking CXX executable 2DConvolution
[ 72%] Built target 2DConvolution
[ 73%] Linking CXX executable kmeans
[ 73%] Built target kmeans
[ 75%] Linking CXX executable lin_reg_error
[ 75%] Built target lin_reg_error
1 warning generated.
[ 76%] Linking CXX executable gesummv
1 warning generated.
[ 77%] Linking CXX executable 2mm
[ 78%] Linking CXX executable mol_dyn
[ 78%] Built target gesummv
[ 78%] Built target 2mm
[ 78%] Built target mol_dyn
[ 80%] Linking CXX executable vec_add
[ 80%] Built target vec_add
[ 81%] Linking CXX executable sf
[ 81%] Built target sf
[ 82%] Linking CXX executable pattern_L2
[ 84%] Linking CXX executable lin_reg_coeff
[ 84%] Built target pattern_L2
[ 84%] Built target lin_reg_coeff
[ 85%] Linking CXX executable DRAM
[ 85%] Built target DRAM
[ 86%] Linking CXX executable dag_task_throughput_sequential
[ 86%] Built target dag_task_throughput_sequential
[ 88%] Linking CXX executable scalar_prod
[ 88%] Built target scalar_prod
[ 89%] Linking CXX executable segmentedreduction
[ 89%] Built target segmentedreduction
make[1]: Target 'all' not remade because of errors.
Makefile:129: recipe for target 'all' failed
make: *** [all] Error 2
make: Target 'default_target' not remade because of errors.

Or with OpenMP and no TBB:

In [ ]:
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib CC=/opt/hipSYCL/llvm/bin/clang CXX=/opt/hipSYCL/llvm/bin/clang++ cmake .. -DSYCL_IMPL=triSYCL -DTRISYCL_TBB=OFF -DTRISYCL_OMP=ON -DTRISYCL_INCLUDE_DIR=/tmp/triSYCL-master/include

Update the default symlink for the running suite to operate on.

In [14]:
! rm -r ./benchmarks
! ln -s ./trisycl-cpu-benchmarks ./benchmarks

Run the benchmarks on CPU:

In [ ]:
! rm ./sycl-bench.csv
! ./run-suite default
rm: cannot remove './sycl-bench.csv': No such file or directory
Using test profile: default


##################################################
Processing sf
##################################################
0.0 10800.0
__________________________________________________

sf --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 0.09319927199976519 s


##################################################
Processing correlation
##################################################
0.0 10800.0
__________________________________________________

correlation --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 41.76671342604095 s


##################################################
Processing mvt
##################################################
0.0 10800.0
__________________________________________________

mvt --num-runs=50 --output=./sycl-bench.csv --size=16384 --local=256
==> Benchmark run finished in 76.5694303069613 s


##################################################
Processing arith
##################################################
0.0 10800.0
__________________________________________________

arith --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 0.14798519801115617 s


##################################################
Processing lin_reg_error
##################################################
0.0 10800.0
__________________________________________________

lin_reg_error --num-runs=50 --output=./sycl-bench.csv --size=65536 --local=256
==> Benchmark run finished in 485.68193354899995 s


##################################################
Processing 3DConvolution
##################################################
0.0 10800.0
__________________________________________________

3DConvolution --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 1316.626640025992 s


##################################################
Processing gramschmidt
##################################################
0.0 10800.0
__________________________________________________

gramschmidt --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 76.95679561496945 s


##################################################
Processing atax
##################################################
0.0 10800.0
__________________________________________________

atax --num-runs=50 --output=./sycl-bench.csv --size=4096 --local=256
==> Benchmark run finished in 5.112743796024006 s


##################################################
Processing 2mm
##################################################
0.0 10800.0
__________________________________________________

2mm --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 336.46451737894677 s


##################################################
Processing gesummv
##################################################
0.0 10800.0
__________________________________________________

gesummv --num-runs=50 --output=./sycl-bench.csv --size=16384 --local=256
==> Benchmark run finished in 99.81492757296655 s


##################################################
Processing matmulchain
##################################################
0.0 10800.0
__________________________________________________

matmulchain --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 39.002122051955666 s


##################################################
Processing vec_add
##################################################
0.0 10800.0
__________________________________________________

vec_add --num-runs=50 --output=./sycl-bench.csv --size=1048576 --local=256
==> Benchmark run finished in 2.629396017990075 s


##################################################
Processing bicg
##################################################
0.0 10800.0
__________________________________________________

bicg --num-runs=50 --output=./sycl-bench.csv --size=16384 --local=256
==> Benchmark run finished in 89.36208289995557 s


##################################################
Processing dag_task_throughput_independent
##################################################
0.0 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --size=1024 --local=256
==> Benchmark run finished in 17.746268673974555 s
17.746268673974555 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --size=2048 --local=256
==> Benchmark run finished in 43.715150523988996 s
43.715150523988996 10800.0
__________________________________________________

dag_task_throughput_independent --num-runs=50 --output=./sycl-bench.csv --size=4096 --local=256

Remove the first comment character from the very first line in the results -- we want to use these column names in the analysis. Then rename the results to something more descriptive.

In [42]:
! tail -c +2 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-trisycl-cpu.csv

Scaling of Vector Addition

Given a workgroup size of 256, the global work ranges from 256 (2^8) to 1048576 (2^20) increasing by a power of two in each of these contrived vector addition tests. Thus on this 32-core (hyperthreaded architecture) Gold CPU we should see a plateau in performance at 8192 () and onwards -- assuming no overheads in the SYCL implementation.

In [498]:
import pandas

vec = pandas.read_csv('./vec_add_ComputeCPP-cpu.csv',comment='#')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_dpc++-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_hipsycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_trisycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_ComputeCPP-opencl.csv',comment='#'),how='outer')

print("using sample size of:",len(list(map(float,vec['run-time-samples'][0].split()))),"elements per data-point")    
using sample size of: 50 elements per data-point

Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.

In [499]:
from tqdm import tqdm


outdat = pandas.DataFrame()

for index, row in tqdm(vec.iterrows(),total=vec.shape[0]):
    samples = row['run-time-samples']
    x = samples.split(' ')
    for y in x:
        tmprow = row
        tmprow['run-time-sample'] = float(y)
        outdat = outdat.append(tmprow)
    
outdat = outdat.drop(columns=['run-time-samples'])

vec = outdat
100%|██████████| 260/260 [01:41<00:00,  2.57it/s]
In [500]:
vec
Out[500]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000044 0.000020 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000044 0.000020 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000046 0.000020 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000048 0.000020 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000049 0.000020 NaN ComputeCpp NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000772 0.001524 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000773 0.001524 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000784 0.001524 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000792 0.001524 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.011498 0.001524 NaN ComputeCpp NaN

13000 rows × 19 columns

We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.

In [501]:
def clear_up_runtime (row):
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "Vega 20 - hipSYCL/ROCm" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "Tesla P100 - hipSYCL/CUDA"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "Xeon Gold - hipSYCL/OpenMP"    
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "Tesla P100 - DPC++/CUDA"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/CPU"

    
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/OpenCL"
    
    
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/OpenCL"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/CPU"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "Xeon Gold - triSYCL/OpenMP"
    else:
        print(device)
    
vec['Runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)

Convert these runtimes to factors.

In [502]:
%%R -i vec -o vec

vec$Runtime <- as.factor(vec$Runtime)

Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.

In [503]:
%%R -i vec -o colour_scale

# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))

#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')

library(scales)
colours <- hue_pal()(length(unique(vec$Runtime)))

names(colours) <- levels(vec$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
In [514]:
vec
Out[514]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric Runtime
1 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000044 0.000020 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
2 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000044 0.000020 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
3 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000046 0.000020 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
4 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000048 0.000020 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
5 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000058 0.000053 0.000044 0.000049 0.000020 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12996 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000772 0.001524 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
12997 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000773 0.001524 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
12998 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000784 0.001524 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
12999 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.000792 0.001524 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
13000 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.000939 0.000728 0.000652 0.011498 0.001524 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL

13000 rows × 20 columns

In [533]:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)


p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
print(p1)

And under a log-transform on the y-axis:

In [535]:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)


p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time", breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
print(p1)

Let's try again with more runtime data... that may clear up any trends -- too speculative with just 50 repeats. Run the following script to collect the results --and feel free to change the --num-runs in the run suite to whatever you'd prefer. The following data was generated with 1000 repeats.

In [ ]:
! ./run_vec_add_tests.sh
In [549]:
import pandas

vec = pandas.read_csv('./vec_add_tests_ComputeCPP-cpu.csv',comment='#')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_dpc++-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_hipsycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_trisycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_ComputeCPP-opencl.csv',comment='#'),how='outer')

print("using sample size of:",len(list(map(float,vec['run-time-samples'][0].split()))),"elements per data-point")    
using sample size of: 1000 elements per data-point

Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.

In [550]:
from tqdm import tqdm


outdat = pandas.DataFrame()

for index, row in tqdm(vec.iterrows(),total=vec.shape[0]):
    samples = row['run-time-samples']
    x = samples.split(' ')
    for y in x:
        tmprow = row
        tmprow['run-time-sample'] = float(y)
        outdat = outdat.append(tmprow)
    
outdat = outdat.drop(columns=['run-time-samples'])

vec = outdat
100%|██████████| 260/260 [2:34:48<00:00, 35.73s/it]  
In [551]:
vec
Out[551]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.014303 0.001418 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.015256 0.001418 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.015294 0.001418 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.016073 0.001418 NaN ComputeCpp NaN
259 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.016149 0.001418 NaN ComputeCpp NaN

260000 rows × 19 columns

We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.

In [552]:
def clear_up_runtime (row):
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "Vega 20 - hipSYCL/ROCm" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "Tesla P100 - hipSYCL/CUDA"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "Xeon Gold - hipSYCL/OpenMP"    
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "Tesla P100 - DPC++/CUDA"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/CPU"

    
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/OpenCL"
    
    
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/OpenCL"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/CPU"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "Xeon Gold - triSYCL/OpenMP"
    else:
        print(device)
    
vec['Runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)

Convert these runtimes to factors.

In [553]:
%%R -i vec -o vec

vec$Runtime <- as.factor(vec$Runtime)

Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.

In [554]:
%%R -i vec -o colour_scale

# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))

#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')

library(scales)
colours <- hue_pal()(length(unique(vec$Runtime)))

names(colours) <- levels(vec$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
In [555]:
vec
Out[555]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric Runtime
1 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
2 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
3 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
4 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
5 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000127 0.000049 0.000034 0.000034 0.000586 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
259996 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.014303 0.001418 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
259997 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.015256 0.001418 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
259998 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.015294 0.001418 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
259999 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.016073 0.001418 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
260000 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 1048576.0 0.001072 0.000912 0.000821 0.016149 0.001418 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL

260000 rows × 20 columns

In [558]:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)


p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5,outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
print(p1)

And under a log-transform on the y-axis:

In [559]:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)


p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5, outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time", breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
print(p1)

Vec-Add over a wider range of Problem Sizes

Repeats (--run-runs) were set to 250 and the problem sizes incrementally increasing over the range of 2^8 to 2^30, to examine the longer term effects, such as viewing how performance tapers off between SYCL backend and implementation.

Generated with the following script:

In [ ]:
! ./run_vec_add_big_tests.sh
In [607]:
import pandas

vec = pandas.read_csv('./vec_add_big_tests_ComputeCPP-cpu.csv',comment='#')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_dpc++-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_hipsycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_trisycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_ComputeCPP-opencl.csv',comment='#'),how='outer')

print("using sample size of:",len(list(map(float,vec['run-time-samples'][0].split()))),"elements per data-point")    
using sample size of: 250 elements per data-point

Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.

In [608]:
from tqdm import tqdm


outdat = pandas.DataFrame()

for index, row in tqdm(vec.iterrows(),total=vec.shape[0]):
    samples = row['run-time-samples']
    x = samples.split(' ')
    for y in x:
        tmprow = row
        tmprow['run-time-sample'] = float(y)
        outdat = outdat.append(tmprow)
    
outdat = outdat.drop(columns=['run-time-samples'])

vec = outdat
100%|██████████| 452/452 [34:00<00:00,  4.51s/it]
In [609]:
vec
Out[609]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000035 0.000746 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000035 0.000746 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000036 0.000746 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000037 0.000746 NaN ComputeCpp NaN
0 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000037 0.000746 NaN ComputeCpp NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
451 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.571703 0.100618 NaN ComputeCpp NaN
451 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.574669 0.100618 NaN ComputeCpp NaN
451 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.785879 0.100618 NaN ComputeCpp NaN
451 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.968766 0.100618 NaN ComputeCpp NaN
451 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 1.020180 0.100618 NaN ComputeCpp NaN

113000 rows × 19 columns

We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.

In [610]:
def clear_up_runtime (row):
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "Vega 20 - hipSYCL/ROCm" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "Tesla P100 - hipSYCL/CUDA"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "Xeon Gold - hipSYCL/OpenMP"    
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "Tesla P100 - DPC++/CUDA"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/CPU"

    
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/OpenCL"
    
    
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/OpenCL"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/CPU"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "Xeon Gold - triSYCL/OpenMP"
    else:
        print(device)
    
vec['Runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)

Convert these runtimes to factors.

In [611]:
%%R -i vec -o vec

vec$Runtime <- as.factor(vec$Runtime)

Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.

In [612]:
%%R -i vec -o colour_scale

# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))

#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')

library(scales)
colours <- hue_pal()(length(unique(vec$Runtime)))

names(colours) <- levels(vec$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
In [613]:
vec
Out[613]:
Benchmark name Verification device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric Runtime
1 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000035 0.000746 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
2 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000035 0.000746 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
3 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000036 0.000746 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
4 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000037 0.000746 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
5 VectorAddition_int32 PASS Host Device NaN NaN NaN NaN NaN NaN 256.0 256.0 0.000159 0.000051 0.000035 0.000037 0.000746 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/CPU
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
112996 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.571703 0.100618 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
112997 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.574669 0.100618 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
112998 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.785879 0.100618 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
112999 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 0.968766 0.100618 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL
113000 VectorAddition_fp64 PASS Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz NaN NaN NaN NaN NaN NaN 256.0 536870912.0 0.378730 0.336295 0.262811 1.020180 0.100618 NaN ComputeCpp NaN Xeon Gold - ComputeCpp/OpenCL

113000 rows × 20 columns

In [614]:
x = vec[(vec['Runtime'] == "Xeon Gold - ComputeCpp/OpenCL")]
x = x[x["problem-size"] == 256]
x = x[x['Benchmark name'] == "VectorAddition_int32"]
len(x) #why was the OpenCL version running the wrong test?
Out[614]:
250
In [615]:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)


p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5,outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
print(p1)

And under a log-transform on the y-axis:

In [616]:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)


p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5, outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time", breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
print(p1)

Note rerun these tests when the system is idle.

Local Work-groups

This outlines the experiment used to see the effect of local-workgroup sizes on different architectures. In particular, are the largest possible even division of tasks by the number of cores the most suited to CPU architectures reguardless of backend? This would mitigate the overhead in the backend languages scheduling policy. What about GPU backends -- does the optimal local workgroup size vary between backends despite using exactly the same architecture?

Unfortunately, the Vector Addition benchmark is in the BKP SYCL Computational Construct and thus does not support setting local workgroup sizes. Instead we add a new version of Vector Addition vec_add_wgp with this functionality.

To generate this data on hipSYCL with OpenMP:

In [121]:
#the default/baseline sycl bkp -- where no workgroups are specified
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#the serial baseline version -- uses no parallelism
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_serial --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels with the OpenMP Number of threads statically set from the command line -- we should know more about the parallelism available in the application than the runtime.
#This set-num-threads should be no greater than the physical cores available in the system.
! OMP_NUM_THREADS=1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=2 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=4 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=8 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=16 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=32 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels without overriding the core utilization in OpenMP.
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=32 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=64 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=128 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=256 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=512 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1024 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=2048 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=4096 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=8192 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=16384 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=32768 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=65536 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=131072 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=262144 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=524288 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1048576 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=2097152 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=4194304 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=8388608 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=16777216 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
^C

To generate this data on triSYCL with OpenMP:

In [121]:
#the default/baseline sycl bkp -- where no workgroups are specified
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#the serial baseline version -- uses no parallelism
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_serial --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels with the OpenMP Number of threads statically set from the command line -- we should know more about the parallelism available in the application than the runtime.
#This set-num-threads should be no greater than the physical cores available in the system.
! OMP_NUM_THREADS=1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=2 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=4 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=8 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=16 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=32 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels without overriding the core utilization in OpenMP.
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=32 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=64 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=128 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=256 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=512 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1024 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=2048 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=4096 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=8192 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=16384 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=32768 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=65536 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=131072 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=262144 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=524288 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1048576 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=2097152 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=4194304 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=8388608 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=16777216 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
Problem size=1073741824 # Work Groups=33554432 of 32 big
^C
In [158]:
import pandas
vec = pandas.read_csv('./local_workgroup_overheads_with_sycl_implementations.csv', comment='#')

#Rearrange and clean the data
from tqdm import tqdm
outdat = pandas.DataFrame()

for index, row in tqdm(vec.iterrows(),total=vec.shape[0]):
    samples = row['run-time-samples']
    x = samples.split(' ')
    for y in x:
        tmprow = row
        tmprow['run-time-sample'] = float(y)
        outdat = outdat.append(tmprow)
    
outdat = outdat.drop(columns=['run-time-samples'])
vec = outdat
100%|██████████| 34/34 [00:36<00:00,  1.07s/it]
In [159]:
#import pandas
#vec = pandas.read_csv('./split_test.csv',comment='#')
#vec['local-size'].unique()
def clear_up_runtime (row):
    return str(row['sycl-implementation'])
    
vec['runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)

vec = vec.drop(columns=['Verification'])
In [160]:
vec
Out[160]:
Benchmark name device-name kernel-time-mean kernel-time-median kernel-time-min kernel-time-samples kernel-time-stddev kernel-time-throughput local-size problem-size run-time-mean run-time-median run-time-min run-time-sample run-time-stddev run-time-throughput sycl-implementation throughput-metric runtime
0 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.073742e+09 1.073742e+09 1.028220 1.003808 0.999748 0.999748 0.084924 NaN hipSYCL-serial NaN hipSYCL-serial
0 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.073742e+09 1.073742e+09 1.028220 1.003808 0.999748 0.999770 0.084924 NaN hipSYCL-serial NaN hipSYCL-serial
0 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.073742e+09 1.073742e+09 1.028220 1.003808 0.999748 0.999804 0.084924 NaN hipSYCL-serial NaN hipSYCL-serial
0 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.073742e+09 1.073742e+09 1.028220 1.003808 0.999748 0.999807 0.084924 NaN hipSYCL-serial NaN hipSYCL-serial
0 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.073742e+09 1.073742e+09 1.028220 1.003808 0.999748 0.999823 0.084924 NaN hipSYCL-serial NaN hipSYCL-serial
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.677722e+07 1.073742e+09 0.343338 0.321656 0.232281 0.485182 0.063682 NaN hipSYCL-openmp NaN hipSYCL-openmp
33 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.677722e+07 1.073742e+09 0.343338 0.321656 0.232281 0.519882 0.063682 NaN hipSYCL-openmp NaN hipSYCL-openmp
33 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.677722e+07 1.073742e+09 0.343338 0.321656 0.232281 0.525771 0.063682 NaN hipSYCL-openmp NaN hipSYCL-openmp
33 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.677722e+07 1.073742e+09 0.343338 0.321656 0.232281 0.529801 0.063682 NaN hipSYCL-openmp NaN hipSYCL-openmp
33 VectorAddition_int32 hipCPU OpenMP host device NaN NaN NaN NaN NaN NaN 1.677722e+07 1.073742e+09 0.343338 0.321656 0.232281 0.583167 0.063682 NaN hipSYCL-openmp NaN hipSYCL-openmp

3400 rows × 19 columns

In [161]:
%load_ext rpy2.ipython
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
In [162]:
%%R -i vec -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(vec, aes(x=local.size, y=run.time.sample, colour=sycl.implementation, group = interaction(local.size, sycl.implementation))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time") + geom_hline(yintercept=median(subset(vec, sycl.implementation == "hipSYCL-serial")$run.time.sample), linetype="dashed", color = "blue")
ggsave('hipsycl-openmp-workgroup-size-performance.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)

print(p1)
In [163]:
%%R -i vec -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)

vec$num.workgroups = as.integer(vec$problem.size / vec$local.size)

p1 <- ggplot(vec, aes(x=num.workgroups, y=run.time.sample, colour=sycl.implementation, group = interaction(local.size, sycl.implementation))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "VectorAddition Implementation", x="Number of Workgroups", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Number Of Workgroups", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time") + geom_hline(yintercept=median(subset(vec, sycl.implementation == "hipSYCL-serial")$run.time.sample), linetype="dashed", color = "blue")

ggsave('hipsycl-openmp-and-using-workgroups.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)

triSYCL - with OpenMP

In [130]:
import pandas
vec = pandas.read_csv('./trisycl.csv', comment='#')

#Rearrange and clean the data
from tqdm import tqdm
outdat = pandas.DataFrame()

for index, row in tqdm(vec.iterrows(),total=vec.shape[0]):
    samples = row['run-time-samples']
    x = samples.split(' ')
    for y in x:
        tmprow = row
        tmprow['run-time-sample'] = float(y)
        outdat = outdat.append(tmprow)
    
outdat = outdat.drop(columns=['run-time-samples'])
vec = outdat

def clear_up_runtime (row):
    return str(row['sycl-implementation'])
    
vec['runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)

vec = vec.drop(columns=['Verification'])
100%|██████████| 8/8 [00:00<00:00, 16.43it/s]
In [156]:
%%R -i vec -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(vec, aes(x=local.size, y=run.time.sample, colour=sycl.implementation, group = interaction(local.size, sycl.implementation))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time") + geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample), linetype="dashed", color = "blue")
ggsave('trisycl-openmp-workgroup-size-performance.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)

vec$num.workgroups = as.integer(vec$problem.size / vec$local.size)

p2 <- ggplot(vec, aes(x=num.workgroups, y=run.time.sample, colour=sycl.implementation, group = interaction(local.size, sycl.implementation))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "VectorAddition Implementation", x="Number of Workgroups", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Number Of Workgroups", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time") + geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample), linetype="dashed", color = "blue")
ggsave('trisycl-openmp-and-using-workgroups.pdf', p2, device="pdf",width=8.3, height=11.7, units="in",dpi=320)

print(p2)

WGP

In [172]:
import pandas
vec = pandas.read_csv('./vec_add_wgp_hipsycl-cpu.csv', comment='#')

#Rearrange and clean the data
from tqdm import tqdm
outdat = pandas.DataFrame()

for index, row in tqdm(vec.iterrows(),total=vec.shape[0]):
    samples = row['run-time-samples']
    x = samples.split(' ')
    for y in x:
        tmprow = row
        tmprow['run-time-sample'] = float(y)
        outdat = outdat.append(tmprow)
    
outdat = outdat.drop(columns=['run-time-samples'])
vec = outdat

def clear_up_runtime (row):
    return str(row['sycl-implementation'])
    
vec['runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)

vec = vec.drop(columns=['Verification'])
100%|██████████| 8/8 [00:03<00:00,  2.49it/s]
In [174]:
%%R -i vec -w 8.3 -h 11.7 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')

names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)

p1 <- ggplot(vec, aes(x=local.size, y=run.time.sample, colour=sycl.implementation, group = interaction(local.size, sycl.implementation))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time") + expand_limits(y = 0) + geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample), linetype="dashed", color = "blue")
ggsave('trisycl-openmp-workgroup-size-performance.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)

vec$num.workgroups = as.integer(vec$problem.size / vec$local.size)

p2 <- ggplot(vec, aes(x=num.workgroups, y=run.time.sample, colour=sycl.implementation, group = interaction(local.size, sycl.implementation))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "VectorAddition Implementation", x="Number of Workgroups", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Number Of Workgroups", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time") + expand_limits(y = 0) + geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample), linetype="dashed", color = "blue")
ggsave('trisycl-openmp-and-using-workgroups.pdf', p2, device="pdf",width=8.3, height=11.7, units="in",dpi=320)

print(p2)

Memory access patterns on the CPU architecture can be traced via cache access and miss rates. We use perf as part of the linux-tools-generic package.

In [ ]:
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib perf stat -B -e cache-references,cache-misses ./matmul_bkp --size=1024 --num-runs=100

Matrix Multiplication

In [2]:
import pandas
#matmul = pandas.read_csv('./results/sycl-matmul-implementations-cache-miss-and-execution-times-of-cpu-based-sycl-on-xeon-gold-6134.csv', comment='#')
matmul = pandas.read_csv('./results/sycl-matmul-implementations-cache-miss-and-execution-times-sycl-on-xeon-gold-6134-and-p100.csv', comment='#')
In [3]:
matmul.loc[matmul['Local Size'] == '-','Local Size'] = 1
matmul["Local Size"] = pandas.to_numeric(matmul["Local Size"])
#remove entries without runtimes -- corresponds to tests that won't run or take too long to measure
matmul = matmul.dropna()


from tqdm import tqdm
outdat = pandas.DataFrame()

for index, row in tqdm(matmul.iterrows(),total=matmul.shape[0]):
    samples = row['Runtimes']
    try:
        x = samples.split(' ')
        for y in x:
            tmprow = row
            tmprow['Runtime'] = float(y)
            outdat = outdat.append(tmprow)
    except:
        print(samples)
    
outdat = outdat.drop(columns=['Runtimes'])
matmul = outdat
matmul
100%|██████████| 122/122 [01:03<00:00,  1.91it/s]
Out[3]:
Cache Miss Count Cache Reference Count Implementation and Backend Local Size Median Runtime Missed % of all Cache References Number Of Heavyweight Threads Runtime Total Application Execution Time (for 100 runs) Version
0 2387602.0 1.076362e+11 hipSYCL OpenMP 1.0 4.647662 0.002 1 4.645459 465.295673 Serial
0 2387602.0 1.076362e+11 hipSYCL OpenMP 1.0 4.647662 0.002 1 4.645638 465.295673 Serial
0 2387602.0 1.076362e+11 hipSYCL OpenMP 1.0 4.647662 0.002 1 4.646228 465.295673 Serial
0 2387602.0 1.076362e+11 hipSYCL OpenMP 1.0 4.647662 0.002 1 4.646532 465.295673 Serial
0 2387602.0 1.076362e+11 hipSYCL OpenMP 1.0 4.647662 0.002 1 4.646670 465.295673 Serial
... ... ... ... ... ... ... ... ... ... ...
186 45978777.0 7.063508e+07 hipSYCL ROCm 32.0 0.003401 65.093 32 0.003459 1.428545 WGP
186 45978777.0 7.063508e+07 hipSYCL ROCm 32.0 0.003401 65.093 32 0.003462 1.428545 WGP
186 45978777.0 7.063508e+07 hipSYCL ROCm 32.0 0.003401 65.093 32 0.003464 1.428545 WGP
186 45978777.0 7.063508e+07 hipSYCL ROCm 32.0 0.003401 65.093 32 0.003466 1.428545 WGP
186 45978777.0 7.063508e+07 hipSYCL ROCm 32.0 0.003401 65.093 32 0.003486 1.428545 WGP

12200 rows × 10 columns

In [4]:
df = matmul
df = df.replace('ComputeCPP OpenCL-Gold','ComputeCPP OpenCL - Gold')
df = df.replace('ComputeCPP pthreads','ComputeCPP pthreads - Gold')
df = df.replace('DPC++ CUDA','DPC++ CUDA - P100')
df = df.replace('DPC++ pthreads','DPC++ pthreads - Gold')
df = df.replace('hipSYCL CUDA','hipSYCL CUDA - P100')
df = df.replace('hipSYCL OpenMP','hipSYCL OpenMP - Gold')
df = df.replace('hipSYCL ROCm','hipSYCL ROCm - gfx906')
df = df.replace('triSYCL OpenMP','triSYCL OpenMP - Gold')
matmul = df
In [8]:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')
#library('ggpattern')

names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)

#print(matmul$Implementation.and.Backend)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))

#print(matmul$Version <- as.factor(matmul$Version))
#matmul$Local.Size=as.numeric(levels(matmul$Local.Size))[matmul$Local.Size]
#colour=Implementation.and.Backend, fill=Version, group = interaction(Local.Size, Implementation.and.Backend,Version)
#p1 <- ggplot(matmul, aes(x=Local.Size, y=Runtime,group=interaction(Local.Size,Implementation.and.Backend,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") + ggpattern::geom_boxplot_pattern(aes(pattern=Version,fill=Version,color=Version),pattern_spacing=0.02, position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Log(Execution Time[s])") + expand_limits(y = 0) + geom_hline(yintercept=median(subset(matmul, version == "Serial")$Runtime), linetype="dashed", color = "blue")

#p <- ggplot(mpg, aes(class, hwy)) +
#  geom_boxplot_pattern(
#    aes(
#      pattern      = class,
#      pattern_fill = class
#    ),
#    pattern_spacing = 0.03
#  ) +
#  theme_bw(18) +
#  labs(title = "ggpattern::geom_boxplot_pattern()") +
#  theme(legend.position = 'none') +
#  coord_fixed(1/8)
#print(p)
#ggsave('trisycl-openmp-workgroup-size-performance.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
p1 <- ggplot(matmul, aes(x=Local.Size, y=Runtime,colour=Version,group=interaction(Local.Size,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),legend.position="bottom") + expand_limits(y = 0) + labs(colour = "SYCL Parallel Construct", x="Local Size", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10(TeX("Execution Time $\\Log_{10}$(s)")) + expand_limits(y = 0,x = 1024) + facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) +
    geom_hline(yintercept=median(subset(matmul, Version == "Serial")$Runtime), linetype="dashed", color = "turquoise3")

#print(median(subset(matmul, Version == "Serial")$Runtime))
ggsave('matmul-local-performance.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)

print(p1)

Alternatively, the number of workgroups -- inverse to local workgroup size.

In [93]:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')

names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
matmul$Number.of.Workgroups = as.integer(1024/matmul$Local.Size)
p2 <- ggplot(matmul, aes(x=Number.of.Workgroups, y=Runtime,colour=Version,group=interaction(Local.Size,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Execution Construct", x="Local Size", y="Execution Time (s)") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Number of Local Workgroups", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10(TeX("Execution Time $\\Log_{10}$(s)")) + expand_limits(y = 0) + facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2)    
ggsave('matmul-number-of-workgroups-performance.pdf', p2, device="pdf",width=8.3, height=11.7, units="in",dpi=320)

print(p2)

Percentage of Missed Cache References:

In [385]:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))

p1 <- ggplot(matmul, aes(x=Local.Size, y=Missed...of.all.Cache.References,colour=Version,group=interaction(Local.Size,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),legend.position="bottom") + expand_limits(y = 0) + labs(colour = "SYCL Parallelism Construct", x="Local Size", y="Missed (%) of Cache References") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + expand_limits(y = 0) + facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) + geom_hline(yintercept=median(subset(matmul, version == "Serial")$Runtime), linetype="dashed", color = "blue")
ggsave('matmul-local-cache-miss.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)

Absolute Number of Cache References -- for 100 repeats/samples:

In [101]:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))

p1 <- ggplot(matmul, aes(x=Local.Size, y=Cache.Reference.Count,colour=Version,group=interaction(Local.Size,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Execution Construct", x="Local Size", y="Number of Cache References") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + expand_limits(y = 0) + facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) + geom_hline(yintercept=median(subset(matmul, version == "Serial")$Runtime), linetype="dashed", color = "blue")
ggsave('matmul-local-total-cache-references.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)

Absolute Number of Cache Misses -- for 100 repeats/samples:

In [102]:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))

p1 <- ggplot(matmul, aes(x=Local.Size, y=Cache.Miss.Count,colour=Version,group=interaction(Local.Size,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Execution Construct", x="Local Size", y="Number of Cache Misses") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + expand_limits(y = 0) + facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) + geom_hline(yintercept=median(subset(matmul, version == "Serial")$Runtime), linetype="dashed", color = "blue")
ggsave('matmul-local-total-cache-miss.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)

Percentage of Missed Cache References for just the CPU:

In [10]:
%%R -i matmul -w 11.7 -h 4.15 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')
library('scales')
#library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold",  "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))

matmul <- subset(matmul,Implementation.and.Backend!="DPC++ CUDA - P100" & Implementation.and.Backend!="hipSYCL CUDA - P100" & Implementation.and.Backend!="hipSYCL ROCm - gfx906")

#having "Gold" in the names of the backend is redundant -- this is only showing the CPU device for cache misses!
matmul$Implementation.and.Backend <- sub(" - Gold$", "", matmul$Implementation.and.Backend)

p1 <- ggplot(matmul, aes(x=Local.Size, y=Missed...of.all.Cache.References,colour=Version,group=interaction(Local.Size,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),legend.position="bottom") + expand_limits(y = 0) + labs(colour = "SYCL Parallel Construct", x="Local Size", y="Missed (%) of Cache References") + geom_boxplot(position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + expand_limits(y = 0, x=1024) + facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=1) + geom_hline(yintercept=median(subset(matmul, version == "Serial")$Runtime), linetype="dashed", color = "blue")
ggsave('matmul-cpu-local-cache-miss.pdf', p1, device="pdf",width=11.7, height=4.15, units="in",dpi=320)
print(p1)

SYCL-Bench Figures

In [363]:
wgp_fp['Benchmark name'] = wgp_fp['Benchmark name'].str.replace("_NDRange","")
hdp['Benchmark name'] = hdp['Benchmark name'].str.replace("_Hierarchical","")
hdp['Benchmark name'] = hdp['Benchmark name'].str.replace("_HierarchicalParallelFor","")
task['Benchmark name'] = task['Benchmark name'].str.replace("_SingleTask","")
sync['Benchmark name'] = sync['Benchmark name'].str.replace("_NDRange","")
In [383]:
%%R -i bkp_fp32 -i wgp_fp -i hdp -i task -i sync -i colour_scale -h 11.7 -w 8.3 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_fp32) <- make.names(names(bkp_fp32), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
#ggsave('bkp-float.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)

names(wgp_fp) <- make.names(names(wgp_fp), unique = FALSE, allow_ = TRUE)
wgp_fp$data.type.width <- reorder(wgp_fp$data.type.width, as.numeric(wgp_fp$data.type.width))
wgp_fp$Benchmark.name <- gsub("_NDRange", "", wgp_fp$Benchmark.name)
wgp_fp <- subset(wgp_fp, data.type.width=="32")
levels(wgp_fp$data.type.width) <- paste("fp",levels(wgp_fp$data.type.width),sep='')
p2 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(hdp) <- make.names(names(hdp), unique = FALSE, allow_ = TRUE)
p3 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale + scale_x_discrete(limits = rev(unique(hdp$Benchmark.name)))

names(task) <- make.names(names(task), unique = FALSE, allow_ = TRUE)
p4 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="SYCL-Bench Application", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(sync) <- make.names(names(sync), unique = FALSE, allow_ = TRUE)
p5 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="SYCL-Bench Application", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none", axis.title.x=element_blank()),
                p2 + theme(legend.position="none", axis.title.x=element_blank()),
                p3 + theme(legend.position="none", axis.title.x=element_blank()),
                p4 + theme(legend.position="none", axis.title.x=element_blank()),
                p5 + theme(legend.position="none", axis.title.x=element_blank()),
                align = 'vh', hjust = -2, ncol = 2)

pg <- pg + draw_plot_label(label=c("a)", "b)", "c)", "d)","e)"),
                           x=c(0.28,0.8,0.28,0.8,0.28),
                           y=c(0.69,0.69,0.36,0.36,0.025),
                           hjust=.5, vjust=.5, size=12)

#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(ncol = 1,title="SYCL Runtime")) + theme(legend.position = "right"))
#bp <- plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)

pg <- pg + draw_grob(legend, 0.75, -0.3, 0, 1)
print(pg)
ggsave('sycl-bench-res.pdf', pg, device="pdf",height=11.7, width=8.3, units="in",dpi=320)
In [422]:
%%R -i bkp_bandw -i bkp_block512 -i colour_scale -w 11.7 -h 4.15 --units in -r 200

outlier_size = 0.10

library('ggplot2')
library('latex2exp')

names(bkp_block512) <- make.names(names(bkp_block512), unique = FALSE, allow_ = TRUE)
bkp_block512$blocksize <- reorder(bkp_block512$blocksize, as.numeric(bkp_block512$blocksize))
p0 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale

names(bkp_bandw) <- make.names(names(bkp_bandw), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale


library('cowplot')
pg <- plot_grid(p0 + theme(legend.position="none"),
                p1 + theme(legend.position="none",plot.margin=unit(c(0,0,0.5,0),"cm")),
                align = 'vh', hjust = -3, ncol = 2)
pg <-pg + draw_plot_label(label=c("a)", "b)"),
                      x=c(0.28,0.8),
                      y=c(0.025,0.025),
                      hjust=.5, vjust=.5, size=12)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(ncol = 1,title="SYCL Runtime")) + theme(legend.position = "right"))
bg <- plot_grid(pg, legend, rel_widths = c(0.75, .2),ncol=2)

#pg <- pg + draw_grob(legend, 0.75, -0.3, 0, 1)
#print(pg)
print(bg)
ggsave('microbench.pdf', bg, device="pdf",width=11.7, height=4.15, units="in",dpi=320)
In [ ]: